/*
 * Copyright © 2016 Red Hat.
 * Copyright © 2016 Bas Nieuwenhuizen
 *
 * based in part on anv driver which is:
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#include "vc4_private.h"
#include "vc4_vk_job.h"
#include "vc4_qir.h"
#include "vc4_vk_program.h"
#include "vc4_vk_formats.h"
#include "vc4_memory.h"
#include "vk_format_info.h"
#include "util/u_pack_color.h"

#define VC4_HW_2116_COUNT		0x1ef0

/* Parameters describing a single draw call, mirroring gallium's
 * pipe_draw_info so the vc4 gallium-derived emission code can be driven
 * from the Vulkan entrypoints. */
struct vc4_draw_info {
    uint32_t vertex_count;    /* vertices per instance */
    uint32_t instance_count;  /* number of instances to draw */
    uint32_t start;           /* first vertex (or first index for indexed draws) */
    uint32_t first_instance;  /* starting instance id */

    uint8_t index_size;       /* size of one index in bytes — presumably 0 for non-indexed draws; TODO confirm */

    /**
     * For indexed drawing, these fields apply after index lookup.
     */
    int index_bias; /**< a bias to be added to each index */
    unsigned min_index; /**< the min index */
    unsigned max_index; /**< the max index */

    struct {
        struct vc4_buffer *resource;  /**< real buffer */
        VkDeviceSize idx_offset;      /**< byte offset of the index data within the buffer */
    } index;

    enum pipe_prim_type mode; /* primitive topology, in gallium terms */
};

/* Maps VkPrimitiveTopology values to their gallium equivalents.
 *
 * NOTE(review): topologies not listed here (e.g. PATCH_LIST) fall into the
 * designated-initializer gaps and read as 0 — which is likely
 * PIPE_PRIM_POINTS — rather than failing loudly; verify callers never index
 * with an unsupported topology. */
static const enum pipe_prim_type vk_to_pipe_prim_type[] = {
   [VK_PRIMITIVE_TOPOLOGY_POINT_LIST] = PIPE_PRIM_POINTS,
   [VK_PRIMITIVE_TOPOLOGY_LINE_LIST] = PIPE_PRIM_LINES,
   [VK_PRIMITIVE_TOPOLOGY_LINE_STRIP] = PIPE_PRIM_LINE_STRIP,
   [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST] = PIPE_PRIM_TRIANGLES,
   [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP] = PIPE_PRIM_TRIANGLE_STRIP,
   [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN] = PIPE_PRIM_TRIANGLE_FAN,
   [VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY] = PIPE_PRIM_LINES_ADJACENCY,
   [VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY] = PIPE_PRIM_LINE_STRIP_ADJACENCY,
   [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY] = PIPE_PRIM_TRIANGLES_ADJACENCY,
   [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY] = PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY,
};

static
void vc4_context_free(struct vc4_cmd_buffer *cmd_buffer);

/*
 * Clamps a draw so a single primitive packet never exceeds max_verts
 * vertices.
 *
 * On return, *count is the number of vertices to emit in this chunk and
 * *step is how far to advance for the next chunk (smaller than *count for
 * strip-like primitives, which must re-emit the shared vertices).
 *
 * Returns true if the draw had to be split.
 */
static bool
u_split_draw(enum pipe_prim_type mode, uint32_t max_verts,
             uint32_t *count, uint32_t *step)
{
   /* Common case: the whole draw fits in one chunk. */
   if (*count <= max_verts) {
      *step = *count;
      return false;
   }

   switch (mode) {
   case PIPE_PRIM_POINTS:
      *count = *step = max_verts;
      break;
   case PIPE_PRIM_LINES:
      /* Round down to a whole number of lines. */
      *count = *step = max_verts & ~1u;
      break;
   case PIPE_PRIM_LINE_STRIP:
      /* The last vertex of one chunk becomes the first of the next. */
      *count = max_verts;
      *step = max_verts - 1;
      break;
   case PIPE_PRIM_LINE_LOOP:
      *count = max_verts;
      *step = max_verts - 1;
      debug_warn_once("unhandled line loop "
                      "looping behavior with "
                      ">max vert count\n");
      break;
   case PIPE_PRIM_TRIANGLES:
      /* Round down to a whole number of triangles. */
      *count = *step = max_verts - (max_verts % 3);
      break;
   case PIPE_PRIM_TRIANGLE_STRIP:
      /* The last two vertices of one chunk restart the next strip. */
      *count = max_verts;
      *step = max_verts - 2;
      break;
   default:
      debug_warn_once("unhandled primitive "
                      "max vert count, truncating\n");
      *count = *step = max_verts;
   }

   return true;
}

/* FIXME: C&P from radv. tu has similar code. Perhaps common place? */
void
vc4_viewport_compute_xform(const VkViewport *viewport,
                            float scale[3],
                            float translate[3])
{
   float x = viewport->x;
   float y = viewport->y;
   float half_width = 0.5f * viewport->width;
   float half_height = 0.5f * viewport->height;
   double n = viewport->minDepth;
   double f = viewport->maxDepth;

   scale[0] = half_width;
   translate[0] = half_width + x;
   scale[1] = half_height;
   translate[1] = half_height + y;

   scale[2] = (f - n);
   translate[2] = n;

   /* It seems that if the scale is small enough the hardware won't clip
    * correctly so we work around this my choosing the smallest scale that
    * seems to work.
    *
    * This case is exercised by CTS:
    * dEQP-VK.draw.inverted_depth_ranges.nodepthclamp_deltazero
    */
   const float min_abs_scale = 0.000009f;
   if (fabs(scale[2]) < min_abs_scale)
      scale[2] = min_abs_scale * (scale[2] < 0 ? -1.0f : 1.0f);
}

/*
 * Allocates and minimally initializes a new command buffer, linking it
 * into the pool's live list.  Returns VK_ERROR_OUT_OF_HOST_MEMORY on
 * allocation failure.
 */
static VkResult
vc4_create_cmd_buffer(struct vc4_device *device,
                     struct vc4_cmd_pool *pool,
                     VkCommandBufferLevel level,
                     VkCommandBuffer *pCommandBuffer)
{
   struct vc4_cmd_buffer *cmd_buffer;

   cmd_buffer = vk_object_zalloc(&device->vk, NULL, sizeof(*cmd_buffer),
                                 VK_OBJECT_TYPE_COMMAND_BUFFER);
   if (cmd_buffer == NULL)
      return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);

   cmd_buffer->device = device;
   cmd_buffer->pool = pool;
   cmd_buffer->level = level;

   /* Initialize the job lists here: vk_object_zalloc leaves them as NULL
    * pointers, and cmd_buffer_free_resources() iterates them with
    * list_for_each_entry_safe(), which would dereference NULL if this
    * command buffer were destroyed before ever being reset. */
   list_inithead(&cmd_buffer->state.render_jobs);
   list_inithead(&cmd_buffer->state.render_jobs_done);

   /* Make the initial status explicit instead of relying on the enum's
    * zero value coming from zalloc. */
   cmd_buffer->status = VC4_CMD_BUFFER_STATUS_INITIAL;

   assert(pool);
   list_addtail(&cmd_buffer->pool_link, &pool->cmd_buffers);
   cmd_buffer->queue_family_index = pool->queue_family_index;

   *pCommandBuffer = vc4_cmd_buffer_to_handle(cmd_buffer);

   return VK_SUCCESS;
}

/*
 * Releases everything owned by the command buffer's recorded state:
 * pending and completed render jobs, the in-flight job on the wrapped
 * context (if any), the per-render-pass attachment state array, and
 * finally the context itself.  Pointers are NULLed after freeing so a
 * later reset/destroy cannot double-free them.
 */
static void
cmd_buffer_free_resources(struct vc4_cmd_buffer *cmd_buffer)
{
   struct vc4_cmd_buffer_state *state =  &cmd_buffer->state;

   /* free resources */
   /* Free jobs still pending submission... */
   list_for_each_entry_safe(struct vc4_vk_job, job, &state->render_jobs, list) {
      list_del(&job->list);
      vc4_job_free(job);
   }

   /* ...and jobs that already completed but were kept around. */
   list_for_each_entry_safe(struct vc4_vk_job, job, &state->render_jobs_done, list) {
      list_del(&job->list);
      vc4_job_free(job);
   }

   /* The context may hold a job that was never linked into either list. */
   if (state->vc4 && state->vc4->job) {
      vc4_job_free(state->vc4->job);
      state->vc4->job = NULL;
   }

   /* free framebuffer */
   if (state->attachments) {
      vk_free(&cmd_buffer->device->vk.alloc, state->attachments);
      state->attachments = NULL;
   }

   /* Finally tear down the wrapped context itself. */
   vc4_context_free(cmd_buffer);
}

/*
 * Returns the command buffer to the INITIAL state, releasing any recorded
 * resources first (skipped for INVALID buffers — presumably their state
 * cannot be safely walked).
 */
static VkResult
vc4_reset_cmd_buffer(struct vc4_cmd_buffer *cmd_buffer)
{
   if (cmd_buffer->status != VC4_CMD_BUFFER_STATUS_INVALID)
      cmd_buffer_free_resources(cmd_buffer);

   /* Start over with a completely clean recording state. */
   memset(&cmd_buffer->state, 0, sizeof(cmd_buffer->state));
   list_inithead(&cmd_buffer->state.render_jobs);
   list_inithead(&cmd_buffer->state.render_jobs_done);

   cmd_buffer->record_result = VK_SUCCESS;
   cmd_buffer->status = VC4_CMD_BUFFER_STATUS_INITIAL;

   return VK_SUCCESS;
}

/* Unlinks the command buffer from its pool, releases everything it owns,
 * and frees the object itself.  Order matters: resources must be freed
 * before the containing allocation. */
static void
vc4_cmd_buffer_destroy(struct vc4_cmd_buffer *cmd_buffer)
{
   list_del(&cmd_buffer->pool_link);
   cmd_buffer_free_resources(cmd_buffer);
   vk_free(&cmd_buffer->pool->alloc, cmd_buffer);
}

/* vkCreateCommandPool: allocates a pool tracking live and recycled
 * command buffers for one queue family. */
VkResult
vc4_CreateCommandPool(VkDevice _device,
                     const VkCommandPoolCreateInfo *pCreateInfo,
                     const VkAllocationCallbacks *pAllocator,
                     VkCommandPool *pCmdPool)
{
   VC4_FROM_HANDLE(vc4_device, device, _device);

   struct vc4_cmd_pool *pool =
      vk_alloc2(&device->vk.alloc, pAllocator, sizeof(*pool), 8,
                VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (pool == NULL)
      return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);

   /* Remember which allocator to use for per-buffer allocations. */
   pool->alloc = pAllocator ? *pAllocator : device->vk.alloc;

   /* Buffers currently handed to the application vs. recycled ones. */
   list_inithead(&pool->cmd_buffers);
   list_inithead(&pool->free_cmd_buffers);

   pool->queue_family_index = pCreateInfo->queueFamilyIndex;

   *pCmdPool = vc4_cmd_pool_to_handle(pool);

   return VK_SUCCESS;
}

/* vkDestroyCommandPool: destroys every command buffer still owned by the
 * pool (both live and recycled) and then the pool itself. */
void
vc4_DestroyCommandPool(VkDevice _device,
                      VkCommandPool commandPool,
                      const VkAllocationCallbacks *pAllocator)
{
   VC4_FROM_HANDLE(vc4_device, device, _device);
   VC4_FROM_HANDLE(vc4_cmd_pool, pool, commandPool);

   /* Destroying VK_NULL_HANDLE is a valid no-op. */
   if (!pool)
      return;

   list_for_each_entry_safe(struct vc4_cmd_buffer, cmd_buffer,
                            &pool->cmd_buffers, pool_link) {
      vc4_cmd_buffer_destroy(cmd_buffer);
   }

   list_for_each_entry_safe(struct vc4_cmd_buffer, cmd_buffer,
                            &pool->free_cmd_buffers, pool_link) {
      vc4_cmd_buffer_destroy(cmd_buffer);
   }

   vk_free2(&device->vk.alloc, pAllocator, pool);
}

/* vkAllocateCommandBuffers: hands out command buffers, preferring to
 * recycle ones previously returned to the pool's free list. */
VkResult
vc4_AllocateCommandBuffers(VkDevice _device,
                          const VkCommandBufferAllocateInfo *pAllocateInfo,
                          VkCommandBuffer *pCommandBuffers)
{
   VC4_FROM_HANDLE(vc4_device, device, _device);
   VC4_FROM_HANDLE(vc4_cmd_pool, pool, pAllocateInfo->commandPool);

   VkResult result = VK_SUCCESS;
   uint32_t i = 0;

   while (i < pAllocateInfo->commandBufferCount) {
      if (list_is_empty(&pool->free_cmd_buffers)) {
         /* Nothing to recycle: allocate a brand new command buffer. */
         result = vc4_create_cmd_buffer(device, pool, pAllocateInfo->level,
                                       &pCommandBuffers[i]);
      } else {
         /* Recycle a previously freed command buffer from the pool. */
         struct vc4_cmd_buffer *cmd_buffer = list_first_entry(
            &pool->free_cmd_buffers, struct vc4_cmd_buffer, pool_link);

         list_del(&cmd_buffer->pool_link);
         list_addtail(&cmd_buffer->pool_link, &pool->cmd_buffers);

         result = vc4_reset_cmd_buffer(cmd_buffer);
         cmd_buffer->level = pAllocateInfo->level;

         pCommandBuffers[i] = vc4_cmd_buffer_to_handle(cmd_buffer);
      }

      if (result != VK_SUCCESS)
         break;
      i++;
   }

   if (result != VK_SUCCESS) {
      /* Destroy the i buffers that were successfully allocated so far. */
      vc4_FreeCommandBuffers(_device, pAllocateInfo->commandPool, i,
                            pCommandBuffers);

      /* From the Vulkan 1.0.66 spec:
       *
       * "vkAllocateCommandBuffers can be used to create multiple
       *  command buffers. If the creation of any of those command
       *  buffers fails, the implementation must destroy all
       *  successfully created command buffer objects from this
       *  command, set all entries of the pCommandBuffers array to
       *  NULL and return the error."
       */
      memset(pCommandBuffers, 0,
             sizeof(*pCommandBuffers) * pAllocateInfo->commandBufferCount);
   }

   return result;
}

/* vkFreeCommandBuffers: returns buffers to their pool's free list for
 * later recycling (or destroys the pool-less ones outright). */
void
vc4_FreeCommandBuffers(VkDevice device,
                      VkCommandPool commandPool,
                      uint32_t commandBufferCount,
                      const VkCommandBuffer *pCommandBuffers)
{
   for (uint32_t i = 0; i < commandBufferCount; i++) {
      VC4_FROM_HANDLE(vc4_cmd_buffer, cmd_buffer, pCommandBuffers[i]);

      /* NULL handles in the array are valid and ignored. */
      if (!cmd_buffer)
         continue;

      if (cmd_buffer->pool) {
         /* Move it back onto the pool's free list for reuse. */
         list_del(&cmd_buffer->pool_link);
         list_addtail(&cmd_buffer->pool_link,
                      &cmd_buffer->pool->free_cmd_buffers);
      } else {
         vc4_cmd_buffer_destroy(cmd_buffer);
      }
   }
}

/* vkBeginCommandBuffer: moves the buffer into the RECORDING state,
 * resetting it first when needed. */
VkResult
vc4_BeginCommandBuffer(VkCommandBuffer commandBuffer,
                        const VkCommandBufferBeginInfo *pBeginInfo)
{
   VC4_FROM_HANDLE(vc4_cmd_buffer, cmd_buffer, commandBuffer);

   /* If the command buffer has already been reset with
    * vkResetCommandBuffer there is no need to do it again here.
    */
   if (cmd_buffer->status != VC4_CMD_BUFFER_STATUS_INITIAL) {
      VkResult result = vc4_reset_cmd_buffer(cmd_buffer);
      if (result != VK_SUCCESS)
         return result;
   }

   cmd_buffer->usage_flags = pBeginInfo->flags;

   /* TODO: level-specific (primary vs. secondary) initial configuration;
    * both paths are currently no-ops. */

   cmd_buffer->status = VC4_CMD_BUFFER_STATUS_RECORDING;

   return VK_SUCCESS;
}

/* vkEndCommandBuffer: marks recording finished and reports any error that
 * was latched into record_result while recording. */
VkResult
vc4_EndCommandBuffer(VkCommandBuffer commandBuffer)
{
   VC4_FROM_HANDLE(vc4_cmd_buffer, cmd_buffer, commandBuffer);

   /* NOTE(review): the buffer is marked EXECUTABLE even when
    * record_result holds an error; callers must check the returned
    * result to notice a failed recording — confirm this is intended. */
   cmd_buffer->status = VC4_CMD_BUFFER_STATUS_EXECUTABLE;
   return cmd_buffer->record_result;
}

/* vkCmdBindDescriptorSets: records the bound sets and their dynamic
 * offsets into the per-bind-point descriptor state, flagging the
 * appropriate dirty bit only when something actually changed. */
void
vc4_CmdBindDescriptorSets(VkCommandBuffer commandBuffer,
                          VkPipelineBindPoint pipelineBindPoint,
                          VkPipelineLayout _layout,
                          uint32_t firstSet,
                          uint32_t descriptorSetCount,
                          const VkDescriptorSet *pDescriptorSets,
                          uint32_t dynamicOffsetCount,
                          const uint32_t *pDynamicOffsets)
{
   VC4_FROM_HANDLE(vc4_cmd_buffer, cmd_buffer, commandBuffer);
   VC4_FROM_HANDLE(vc4_pipeline_layout, layout, _layout);

   assert(firstSet + descriptorSetCount <= MAX_SETS);

   /* Descriptor state is tracked separately per bind point. */
   struct vc4_descriptor_state *desc_state =
      &cmd_buffer->state.descriptor_state[pipelineBindPoint];

   bool changed = false;
   uint32_t dyn_index = 0;

   for (uint32_t i = 0; i < descriptorSetCount; i++) {
      VC4_FROM_HANDLE(vc4_descriptor_set, set, pDescriptorSets[i]);
      const uint32_t index = firstSet + i;

      if (desc_state->descriptor_sets[index] != set) {
         desc_state->descriptor_sets[index] = set;
         changed = true;
      }

      if (!(desc_state->valid & (1u << index))) {
         desc_state->valid |= (1u << index);
         changed = true;
      }

      /* Dynamic offsets are consumed in set/binding order. */
      for (uint32_t j = 0; j < set->layout->dynamic_offset_count; j++, dyn_index++) {
         const uint32_t idx = layout->set[index].dynamic_offset_start + j;

         if (desc_state->dynamic_offsets[idx] != pDynamicOffsets[dyn_index]) {
            desc_state->dynamic_offsets[idx] = pDynamicOffsets[dyn_index];
            changed = true;
         }
      }
   }

   if (changed) {
      cmd_buffer->state.dirty |=
         pipelineBindPoint == VK_PIPELINE_BIND_POINT_GRAPHICS ?
            VC4_CMD_DIRTY_DESCRIPTOR_SETS :
            VC4_CMD_DIRTY_COMPUTE_DESCRIPTOR_SETS;
   }
}

/*
 * Records, for every attachment of the render pass, which aspects must be
 * cleared on first use (LOAD_OP_CLEAR) together with the clear value
 * supplied at vkCmdBeginRenderPass time.
 */
static void
state_setup_attachments(struct vc4_attachment_state *attachments,
                        struct vc4_render_pass *pass,
                        const VkClearValue *clear_values)
{
   for (uint32_t i = 0; i < pass->attachment_count; ++i) {
      struct vc4_render_pass_attachment *att = &pass->attachments[i];
      VkImageAspectFlags att_aspects = vk_format_aspects(att->desc.format);
      VkImageAspectFlags clear_aspects = 0;

      if (att_aspects == VK_IMAGE_ASPECT_COLOR_BIT) {
         /* color attachment */
         if (att->desc.loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR) {
            clear_aspects |= VK_IMAGE_ASPECT_COLOR_BIT;
         }
      } else {
         /* depthstencil attachment */
         if ((att_aspects & VK_IMAGE_ASPECT_DEPTH_BIT) &&
             att->desc.loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR) {
            clear_aspects |= VK_IMAGE_ASPECT_DEPTH_BIT;
            /* NOTE(review): when depth is cleared and the stencil load op
             * is DONT_CARE, stencil is cleared as well — presumably
             * because the hardware clears both aspects together; confirm
             * this is intentional rather than a condition typo. */
            if ((att_aspects & VK_IMAGE_ASPECT_STENCIL_BIT) &&
                att->desc.stencilLoadOp == VK_ATTACHMENT_LOAD_OP_DONT_CARE)
               clear_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT;
         }
         if ((att_aspects & VK_IMAGE_ASPECT_STENCIL_BIT) &&
             att->desc.stencilLoadOp == VK_ATTACHMENT_LOAD_OP_CLEAR) {
            clear_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT;
         }
      }

      attachments[i].pending_clear_aspects = clear_aspects;
      if (clear_values)
         attachments[i].clear_value = clear_values[i];
   }
}

/* vkCmdBeginRenderPass: latches the pass/framebuffer into the command
 * buffer state, allocates per-attachment clear state, and marks the
 * scissor dirty when the render area shrinks the current clip window. */
void
vc4_CmdBeginRenderPass(VkCommandBuffer commandBuffer,
                        const VkRenderPassBeginInfo *pRenderPassBegin,
                        VkSubpassContents contents)
{
   VC4_FROM_HANDLE(vc4_cmd_buffer, cmd_buffer, commandBuffer);
   VC4_FROM_HANDLE(vc4_render_pass, pass, pRenderPassBegin->renderPass);
   VC4_FROM_HANDLE(vc4_framebuffer, framebuffer, pRenderPassBegin->framebuffer);

   struct vc4_cmd_buffer_state *state = &cmd_buffer->state;
   state->pass = pass;
   state->framebuffer = framebuffer;

   /* Per-attachment clear/load state for this render pass instance.
    * Freed by cmd_buffer_free_resources() on reset/destroy. */
   state->attachments = vk_zalloc(&cmd_buffer->device->vk.alloc,
                        sizeof(struct vc4_attachment_state) * pass->attachment_count, 8,
                        VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
   if (!state->attachments) {
      vk_error(NULL, VK_ERROR_OUT_OF_HOST_MEMORY);
      return;
   }

   state_setup_attachments(state->attachments, pass, pRenderPassBegin->pClearValues);

   state->render_area = pRenderPassBegin->renderArea;

   /* If our render area is smaller than the current clip window we will have
    * to emit a new clip window to constraint it to the render area.
    */
   uint32_t min_render_x = state->render_area.offset.x;
   /* Fixed: this previously read offset.x (copy-paste bug), breaking the
    * comparison whenever the render area's y offset differs from x. */
   uint32_t min_render_y = state->render_area.offset.y;
   uint32_t max_render_x = min_render_x + state->render_area.extent.width - 1;
   uint32_t max_render_y = min_render_y + state->render_area.extent.height - 1;
   uint32_t min_clip_x = state->clip_window.offset.x;
   uint32_t min_clip_y = state->clip_window.offset.y;
   uint32_t max_clip_x = min_clip_x + state->clip_window.extent.width - 1;
   uint32_t max_clip_y = min_clip_y + state->clip_window.extent.height - 1;
   if (min_render_x > min_clip_x || min_render_y > min_clip_y ||
       max_render_x < max_clip_x || max_render_y < max_clip_y) {
      state->dirty |= VC4_CMD_DIRTY_SCISSOR;
   }

   /* Setup for first subpass */
   // vc4_cmd_buffer_subpass_start(cmd_buffer, 0);
}

/* vkCmdEndRenderPass: finalizes the last subpass (job finishing is still
 * TODO) and clears the in-render-pass state. */
void
vc4_CmdEndRenderPass(VkCommandBuffer commandBuffer)
{
   VC4_FROM_HANDLE(vc4_cmd_buffer, cmd_buffer, commandBuffer);

   /* Finalize last subpass */
   struct vc4_cmd_buffer_state *state = &cmd_buffer->state;
   assert(state->subpass_idx == state->pass->subpass_count - 1);
   // vc4_cmd_buffer_subpass_finish(cmd_buffer);
   // vc4_cmd_buffer_finish_job(cmd_buffer);

   // cmd_buffer_subpass_handle_pending_resolves(cmd_buffer);

   /* We are no longer inside a render pass */
   state->framebuffer = NULL;
   state->pass = NULL;
   state->subpass_idx = -1;
}

/* vkCmdBindVertexBuffers: records the buffer/offset for each binding and
 * marks vertex-buffer state dirty only when something changed.
 *
 * Emitting the actual vertex state is deferred because the stride comes
 * from the pipeline, which may be bound later. */
void
vc4_CmdBindVertexBuffers(VkCommandBuffer commandBuffer,
                          uint32_t firstBinding,
                          uint32_t bindingCount,
                          const VkBuffer *pBuffers,
                          const VkDeviceSize *pOffsets)
{
   VC4_FROM_HANDLE(vc4_cmd_buffer, cmd_buffer, commandBuffer);

   struct vc4_vertexbuf_stateobj *vertexbuf = &cmd_buffer->state.vertexbuf;

   assert(firstBinding + bindingCount <= MAX_VBS);
   bool vb_state_changed = false;

   /* Fixed: the loop index was a signed int compared against the unsigned
    * bindingCount. */
   for (uint32_t i = 0; i < bindingCount; ++i) {
      struct vc4_buffer *buffer = vc4_buffer_from_handle(pBuffers[i]);

      if (vertexbuf->vb[firstBinding + i].buffer != buffer) {
         vertexbuf->vb[firstBinding + i].buffer = buffer;
         vb_state_changed = true;
      }

      if (vertexbuf->vb[firstBinding + i].buffer_offset != pOffsets[i]) {
         vertexbuf->vb[firstBinding + i].buffer_offset = pOffsets[i];
         vb_state_changed = true;
      }
   }

   if (vb_state_changed)
      cmd_buffer->state.dirty |= VC4_CMD_DIRTY_VERTEX_BUFFER;
}

/* This goes though the list of possible dynamic states in the pipeline and,
 * for those that are not configured as dynamic, copies relevant state into
 * the command buffer.
 *
 * Each piece of state is compared against what the command buffer already
 * holds and the matching dirty bit is only raised on an actual change, so
 * redundant pipeline binds don't force state re-emission.
 */
static void
cmd_buffer_bind_pipeline_static_state(struct vc4_cmd_buffer *cmd_buffer,
                                      const struct vc4_dynamic_state *src)
{
   struct vc4_dynamic_state *dest = &cmd_buffer->state.dynamic;
   uint32_t dynamic_mask = src->mask;
   uint32_t dirty = 0;

   if (!(dynamic_mask & VC4_DYNAMIC_VIEWPORT)) {
      /* The count is copied unconditionally; only the viewport contents
       * participate in the change check. */
      dest->viewport.count = src->viewport.count;
      if (memcmp(&dest->viewport.viewports, &src->viewport.viewports,
                 src->viewport.count * sizeof(VkViewport))) {
         typed_memcpy(dest->viewport.viewports,
                      src->viewport.viewports,
                      src->viewport.count);
         typed_memcpy(dest->viewport.scale, src->viewport.scale,
                      src->viewport.count);
         typed_memcpy(dest->viewport.translate, src->viewport.translate,
                      src->viewport.count);
         dirty |= VC4_CMD_DIRTY_VIEWPORT;
      }
   }

   if (!(dynamic_mask & VC4_DYNAMIC_SCISSOR)) {
      dest->scissor.count = src->scissor.count;
      if (memcmp(&dest->scissor.scissors, &src->scissor.scissors,
                 src->scissor.count * sizeof(VkRect2D))) {
         typed_memcpy(dest->scissor.scissors,
                      src->scissor.scissors, src->scissor.count);
         dirty |= VC4_CMD_DIRTY_SCISSOR;
      }
   }

   if (!(dynamic_mask & VC4_DYNAMIC_STENCIL_COMPARE_MASK)) {
      if (memcmp(&dest->stencil_compare_mask, &src->stencil_compare_mask,
                 sizeof(src->stencil_compare_mask))) {
         dest->stencil_compare_mask = src->stencil_compare_mask;
         dirty |= VC4_CMD_DIRTY_STENCIL_COMPARE_MASK;
      }
   }

   if (!(dynamic_mask & VC4_DYNAMIC_STENCIL_WRITE_MASK)) {
      if (memcmp(&dest->stencil_write_mask, &src->stencil_write_mask,
                 sizeof(src->stencil_write_mask))) {
         dest->stencil_write_mask = src->stencil_write_mask;
         dirty |= VC4_CMD_DIRTY_STENCIL_WRITE_MASK;
      }
   }

   if (!(dynamic_mask & VC4_DYNAMIC_STENCIL_REFERENCE)) {
      if (memcmp(&dest->stencil_reference, &src->stencil_reference,
                 sizeof(src->stencil_reference))) {
         dest->stencil_reference = src->stencil_reference;
         dirty |= VC4_CMD_DIRTY_STENCIL_REFERENCE;
      }
   }

   if (!(dynamic_mask & VC4_DYNAMIC_BLEND_CONSTANTS)) {
      if (memcmp(dest->blend_constants, src->blend_constants,
                 sizeof(src->blend_constants))) {
         memcpy(dest->blend_constants, src->blend_constants,
                sizeof(src->blend_constants));
         dirty |= VC4_CMD_DIRTY_BLEND_CONSTANTS;
      }
   }

   if (!(dynamic_mask & VC4_DYNAMIC_DEPTH_BIAS)) {
      if (memcmp(&dest->depth_bias, &src->depth_bias,
                 sizeof(src->depth_bias))) {
         memcpy(&dest->depth_bias, &src->depth_bias, sizeof(src->depth_bias));
         dirty |= VC4_CMD_DIRTY_DEPTH_BIAS;
      }
   }

   if (!(dynamic_mask & VC4_DYNAMIC_LINE_WIDTH)) {
      if (dest->line_width != src->line_width) {
         dest->line_width = src->line_width;
         dirty |= VC4_CMD_DIRTY_LINE_WIDTH;
      }
   }

   /* Record which states the pipeline considers dynamic, and everything
    * that actually changed above. */
   cmd_buffer->state.dynamic.mask = dynamic_mask;
   cmd_buffer->state.dirty |= dirty;
}

/* Binds a graphics pipeline to the command buffer: records it, pulls the
 * per-binding vertex strides out of it, and copies its non-dynamic state
 * into the command buffer. */
static void
bind_graphics_pipeline(struct vc4_cmd_buffer *cmd_buffer,
                       struct vc4_pipeline *pipeline)
{
   assert(pipeline && !(pipeline->active_stages & VK_SHADER_STAGE_COMPUTE_BIT));

   /* Re-binding the same pipeline is a no-op. */
   if (cmd_buffer->state.pipeline == pipeline)
      return;

   /* FIXME (from v3dv): blending to sRGB render targets may need
    * always-flush enabled on the job to pass
    * dEQP-VK.pipeline.blend.format.r8g8b8a8_srgb.*, even though the tile
    * buffer blends in linear format either way.  Not wired up yet. */
   assert(pipeline->subpass);

   cmd_buffer->state.pipeline = pipeline;

   /* The vertex buffer stride lives in the pipeline's vertex input state,
    * so refresh our per-binding copies now. */
   for (uint32_t i = 0; i < MAX_VBS; ++i)
      cmd_buffer->state.vertexbuf.vb[i].stride = pipeline->vb[i].stride;

   cmd_buffer_bind_pipeline_static_state(cmd_buffer, &pipeline->dynamic_state);

   cmd_buffer->state.dirty |= VC4_CMD_DIRTY_PIPELINE;
}

/* vkCmdBindPipeline: dispatches to the per-bind-point handler.  Only the
 * graphics bind point is implemented so far. */
void
vc4_CmdBindPipeline(VkCommandBuffer commandBuffer,
                     VkPipelineBindPoint pipelineBindPoint,
                     VkPipeline _pipeline)
{
   VC4_FROM_HANDLE(vc4_cmd_buffer, cmd_buffer, commandBuffer);
   VC4_FROM_HANDLE(vc4_pipeline, pipeline, _pipeline);

   if (pipelineBindPoint == VK_PIPELINE_BIND_POINT_GRAPHICS) {
      bind_graphics_pipeline(cmd_buffer, pipeline);
   } else {
      /* TODO: VK_PIPELINE_BIND_POINT_COMPUTE. */
      assert(!"invalid bind point");
   }
}
/* Creates a new render job targeting the given color and depth/stencil
 * images, configuring MSAA mode and tile size accordingly. */
static struct vc4_vk_job *
vc4_get_job(struct vc4_device *device,
            struct vc4_image *cbuf, struct vc4_image *zsbuf)
{
   struct vc4_vk_job *job = vc4_job_create(device);

   if (cbuf) {
      if (cbuf->samples > 1) {
         job->msaa = true;
         job->msaa_color_write = cbuf;
      } else {
         job->color_write = cbuf;
         job->color_read = cbuf;
      }
   }

   if (zsbuf) {
      if (zsbuf->samples > 1) {
         job->msaa = true;
         job->msaa_zs_write = zsbuf;
      } else {
         job->zs_write = zsbuf;
         job->zs_read = zsbuf;
      }
   }

   /* MSAA tiles cover half the dimensions since each pixel needs four
    * samples' worth of tile-buffer space. */
   job->tile_width = job->msaa ? 32 : 64;
   job->tile_height = job->msaa ? 32 : 64;

   return job;
}

/* Packs an RGBA float color into the given format's bit layout, returning
 * the packed 16- or 32-bit value. */
static uint32_t
pack_rgba(enum pipe_format format, const float *rgba)
{
   union util_color uc;

   util_pack_color(rgba, format, &uc);

   return util_format_get_blocksize(format) == 2 ? uc.us : uc.ui[0];
}

static struct vc4_vk_job *vc4_get_job_for_fbo(struct vc4_cmd_buffer *cmd_buffer)
{
   struct vc4_vk_job *job;
   struct vc4_image *cbuf = NULL;
   struct vc4_image *zsbuf = NULL;

   uint32_t color_attachment;
   uint32_t ds_attachment;

   struct vc4_framebuffer *fb = cmd_buffer->state.framebuffer;
   struct vc4_render_pass *pass = cmd_buffer->state.pass;

   if (pass->subpass_count > 0 && pass->subpasses[0].color_count > 0) {
      color_attachment = pass->subpasses[0].color_attachments->attachment;
      cbuf = fb->attachments[color_attachment]->image;
   }

   ds_attachment = pass->subpasses[0].ds_attachment.attachment;
   if (pass->subpass_count > 0 && ds_attachment != VK_ATTACHMENT_UNUSED)
      zsbuf = fb->attachments[ds_attachment]->image;

   job = vc4_get_job(cmd_buffer->device, cbuf, zsbuf);

   /* new job need to be built */
   if (cmd_buffer->state.vc4)
      cmd_buffer->state.vc4->dirty = ~0;

   /* Attachments load/store operation only be did, when the renderpass begins */
   if (list_is_empty(&cmd_buffer->state.render_jobs)) {
      if (cbuf) {
         if (pass->attachment_count > 0 &&
             pass->attachments[color_attachment].desc.loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR) {
            enum pipe_format format = vk_format_to_pipe_format(pass->attachments[color_attachment].desc.format);
            const float *rgba = cmd_buffer->state.attachments[color_attachment].clear_value.color.float32;
            job->clear_color[0] = job->clear_color[1] = pack_rgba(format, rgba);

            job->cleared |= PIPE_CLEAR_COLOR0;
         }
      }

      if (zsbuf) {
         if (pass->attachment_count > 0 &&
             pass->attachments[ds_attachment].desc.loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR) {

            if(cmd_buffer->state.attachments[ds_attachment].pending_clear_aspects | VK_IMAGE_ASPECT_STENCIL_BIT) {
               enum pipe_format format = vk_format_to_pipe_format(pass->attachments[ds_attachment].desc.format);

               switch(format) {
               case PIPE_FORMAT_Z24_UNORM_S8_UINT:
               case PIPE_FORMAT_Z24X8_UNORM:
               case PIPE_FORMAT_S8_UINT_Z24_UNORM:
               case PIPE_FORMAT_X8Z24_UNORM: {
                  float depth = cmd_buffer->state.attachments[ds_attachment].clear_value.depthStencil.depth;
                  job->clear_depth = util_pack_z(PIPE_FORMAT_Z24_UNORM_S8_UINT, depth);
                  break;
               }

               default:
                  printf("The depth format don't support !!!\n");
                  assert(0);
                  break;
               }
            }

            if(cmd_buffer->state.attachments[ds_attachment].pending_clear_aspects | VK_IMAGE_ASPECT_DEPTH_BIT) {
               job->clear_stencil = cmd_buffer->state.attachments[ds_attachment].clear_value.depthStencil.stencil & 0xff;
            }

            job->cleared |= PIPE_CLEAR_DEPTHSTENCIL;
         }
      }
   }

   job->draw_tiles_x = DIV_ROUND_UP(fb->width,
                                    job->tile_width);
   job->draw_tiles_y = DIV_ROUND_UP(fb->height,
                                    job->tile_height);

   return job;
}

/*
 * HW-2116 workaround: guards against exceeding VC4_HW_2116_COUNT draw
 * calls queued in a single scene.  The gallium driver flushes the job in
 * this situation; flushing is not wired up here yet, so we abort loudly
 * rather than let the hardware render garbage — TODO: flush instead.
 */
static void
vc4_hw_2116_workaround(struct vc4_cmd_buffer *cmd_buffer, struct vc4_vk_job *job, int vert_count)
{
   if (job->draw_calls_queued + vert_count / 65535 >= VC4_HW_2116_COUNT) {
      /* Fixed the unbalanced parenthesis in the message. */
      fprintf(stderr, "Flushing batch due to HW-2116 workaround "
                      "(too many draw calls per scene)\n");
      abort();
   }
}

/*
 * Reserves worst-case command-list space for an upcoming draw of
 * vert_count vertices, so subsequent cl_emit() calls cannot run out of
 * room mid-draw.
 */
static void
vc4_get_draw_cl_space(struct vc4_vk_job *job, int vert_count)
{
        /* The SW-5891 workaround may cause us to emit multiple shader recs
         * and draw packets.
         */
        int num_draws = DIV_ROUND_UP(vert_count, 65535 - 2) + 1;

        /* Binner gets our packet state -- vc4_emit.c contents,
         * and the primitive itself.
         */
        cl_ensure_space(&job->bcl,
                        256 + (VC4_PACKET_GL_ARRAY_PRIMITIVE_SIZE +
                               VC4_PACKET_GL_SHADER_STATE_SIZE) * num_draws);

        /* Nothing for rcl -- that's covered by vc4_context.c */

        /* shader_rec gets up to 12 dwords of reloc handles plus a maximally
         * sized shader_rec (104 bytes base for 8 vattrs plus 32 bytes of
         * vattr stride).
         */
        cl_ensure_space(&job->shader_rec,
                        (12 * sizeof(uint32_t) + 104 + 8 * 32) * num_draws);

        /* Uniforms are covered by vc4_write_uniforms(). */

        /* There could be up to 16 textures per stage, plus misc other
         * pointers.
         */
        cl_ensure_space(&job->bo_handles, (2 * 16 + 20) * sizeof(uint32_t));
        cl_ensure_space(&job->bo_pointers,
                        (2 * 16 + 20) * sizeof(struct vc4_bo *));
}

/**
 * Does the initial binning command list setup for drawing to a given FBO.
 * No-op if the job has already been started (needs_flush set).
 */
static void
vc4_start_draw(struct vc4_cmd_buffer *cmd_buffer, struct vc4_vk_job *job)
{
        struct vc4_framebuffer *fb = cmd_buffer->state.framebuffer;

        /* Binning was already configured for this job. */
        if (job->needs_flush)
                return;

        vc4_get_draw_cl_space(job, 0);

        cl_emit(&job->bcl, TILE_BINNING_MODE_CONFIGURATION, bin) {
                bin.width_in_tiles = job->draw_tiles_x;
                bin.height_in_tiles = job->draw_tiles_y;
                bin.multisample_mode_4x = job->msaa;
        }

        /* START_TILE_BINNING resets the statechange counters in the hardware,
         * which are what is used when a primitive is binned to a tile to
         * figure out what new state packets need to be written to that tile's
         * command list.
         */
        cl_emit(&job->bcl, START_TILE_BINNING, start);

        /* Reset the current compressed primitives format.  This gets modified
         * by VC4_PACKET_GL_INDEXED_PRIMITIVE and
         * VC4_PACKET_GL_ARRAY_PRIMITIVE, so it needs to be reset at the start
         * of every tile.
         */
        cl_emit(&job->bcl, PRIMITIVE_LIST_FORMAT, list) {
                list.data_type = _16_BIT_INDEX;
                list.primitive_type = TRIANGLES_LIST;
        }

        job->needs_flush = true;
        job->draw_width = fb->width;
        job->draw_height = fb->height;
}

/**
 * Emits the GL shader record (FS/VS/CS code addresses plus one attribute
 * record per vertex element), a GL_SHADER_STATE packet referencing it, and
 * the uniform streams for all three stages.
 *
 * extra_index_bias is added on top of info->index_bias for draws that were
 * split into multiple packets (SW-5891 workaround paths).
 */
static void
vc4_emit_gl_shader_state(struct vc4_context *vc4,
                         const struct vc4_draw_info *info,
                         uint32_t extra_index_bias)
{
        struct vc4_vk_job *job = vc4->job;
        /* VC4_DIRTY_VTXSTATE */
        struct vc4_vertex_stateobj *vtx = vc4->vtx;
        /* VC4_DIRTY_VTXBUF */
        struct vc4_vertexbuf_stateobj *vertexbuf = vc4->vertexbuf;

        /* The simulator throws a fit if VS or CS don't read an attribute, so
         * we emit a dummy read.
         */
        uint32_t num_elements_emit = MAX2(vtx->num_elements, 1);

        /* Emit the shader record.  3 relocs for the FS/CS/VS code BOs, plus
         * one per attribute record.
         */
        cl_start_shader_reloc(&job->shader_rec, 3 + num_elements_emit);

        cl_emit(&job->shader_rec, SHADER_RECORD, rec) {
                rec.enable_clipping = true;

                /* VC4_DIRTY_COMPILED_FS */
                rec.fragment_shader_is_single_threaded =
                        !vc4->prog.fs->fs_threaded;

                /* VC4_DIRTY_PRIM_MODE | VC4_DIRTY_RASTERIZER */
                rec.point_size_included_in_shaded_vertex_data =
                         (info->mode == PIPE_PRIM_POINTS &&
                          vc4->rasterizer->base.point_size_per_vertex);

                /* VC4_DIRTY_COMPILED_FS */
                rec.fragment_shader_number_of_varyings =
                        vc4->prog.fs->num_inputs;
                rec.fragment_shader_code_address =
                        cl_address(vc4->prog.fs->bo, 0);

                rec.coordinate_shader_attribute_array_select_bits =
                         vc4->prog.cs->vattrs_live;
                rec.coordinate_shader_total_attributes_size =
                         vc4->prog.cs->vattr_offsets[8];
                rec.coordinate_shader_code_address =
                        cl_address(vc4->prog.cs->bo, 0);

                rec.vertex_shader_attribute_array_select_bits =
                         vc4->prog.vs->vattrs_live;
                rec.vertex_shader_total_attributes_size =
                         vc4->prog.vs->vattr_offsets[8];
                rec.vertex_shader_code_address =
                        cl_address(vc4->prog.vs->bo, 0);
        };

        /* Emit one attribute record per vertex element, tracking the most
         * conservative max_index that keeps every enabled attribute fetch
         * inside its vertex buffer.
         */
        uint32_t max_index = 0xffff;
        for (int i = 0; i < vtx->num_elements; i++) {
                struct pipe_vertex_element *elem = &vtx->pipe[i];
                struct vc4_vertex_buffer *vb =
                        &vertexbuf->vb[elem->vertex_buffer_index];
               //  struct vc4_resource *rsc = vc4_resource(vb->buffer.resource);
                struct vc4_buffer *rsc = vb->buffer;
                /* not vc4->dirty tracked: vc4->last_index_bias */
                uint32_t offset = (vb->buffer_offset +
                                   elem->src_offset +
                                   vb->stride * (info->index_bias +
                                                 extra_index_bias));
                uint32_t vb_size = rsc->bo->size - offset;
                uint32_t elem_size =
                        util_format_get_blocksize(elem->src_format);

                cl_emit(&job->shader_rec, ATTRIBUTE_RECORD, attr) {
                        attr.address = cl_address(rsc->bo, offset);
                        attr.number_of_bytes_minus_1 = elem_size - 1;
                        attr.stride = vb->stride;
                        attr.coordinate_shader_vpm_offset =
                                vc4->prog.cs->vattr_offsets[i];
                        attr.vertex_shader_vpm_offset =
                                vc4->prog.vs->vattr_offsets[i];
                }

                if (vb->stride > 0) {
                        max_index = MIN2(max_index,
                                         (vb_size - elem_size) / vb->stride);
                }
        }

        /* No vertex elements at all: emit a dummy record reading from a
         * scratch BO so the hardware/simulator still sees one attribute.
         */
        if (vtx->num_elements == 0) {
                assert(num_elements_emit == 1);
               //  struct vc4_bo *bo = vc4_bo_alloc(vc4->screen, 4096, "scratch VBO");
                struct vc4_bo *bo = vc4_vk_bo_alloc(vc4->device, 4096);

                cl_emit(&job->shader_rec, ATTRIBUTE_RECORD, attr) {
                        attr.address = cl_address(bo, 0);
                        attr.number_of_bytes_minus_1 = 16 - 1;
                        attr.stride = 0;
                        attr.coordinate_shader_vpm_offset = 0;
                        attr.vertex_shader_vpm_offset = 0;
                }

               //  vc4_bo_unreference(&bo);
               /* NOTE(review): the gallium driver only dropped a refcount
                * here, with the job keeping the BO alive.  vc4_vk_bo_free()
                * looks like an immediate free while the shader rec above
                * still references the BO — confirm the free is deferred
                * until the job completes.
                */
               vc4_vk_bo_free(vc4->device, bo);
        }

        cl_emit(&job->bcl, GL_SHADER_STATE, shader_state) {
                /* Note that number of attributes == 0 in the packet means 8
                 * attributes.  This field also contains the offset into
                 * shader_rec.
                 */
                assert(vtx->num_elements <= 8);
                shader_state.number_of_attribute_arrays =
                        num_elements_emit & 0x7;
        }

        /* Upload the uniform streams for all three stages; the coordinate
         * shader shares the vertex stage's texture state.
         */
        vc4_write_uniforms(vc4, vc4->prog.fs,
                           &vc4->constbuf,
                           &vc4->fragtex);
        vc4_write_uniforms(vc4, vc4->prog.vs,
                           &vc4->constbuf,
                           &vc4->verttex);
        vc4_write_uniforms(vc4, vc4->prog.cs,
                           &vc4->constbuf,
                           &vc4->verttex);

        vc4->last_index_bias = info->index_bias + extra_index_bias;
        vc4->max_index = max_index;
        job->shader_rec_count++;
}

/* Converts a float to the hardware's 1.8.7 fixed/half format used by the
 * depth-offset packet: the top 16 bits of the IEEE-754 encoding.
 */
static uint16_t
float_to_187_half(float f)
{
        uint32_t ieee_bits = fui(f);

        return ieee_bits >> 16;
}

/* Points the context at the pipeline's blend state and flags it for
 * re-emission; the state itself is owned by the pipeline.
 */
static VkResult
vc4_create_blend_state(struct vc4_context *vc4, struct vc4_cmd_buffer *cmd_buffer)
{
   vc4->blend = &cmd_buffer->state.pipeline->blend_state;
   vc4->dirty |= VC4_DIRTY_BLEND;
   return VK_SUCCESS;
}

/**
 * The TLB_STENCIL_SETUP data has a little bitfield for common writemask
 * values, so you don't have to do a separate writemask setup.
 *
 * Returns the 2-bit encoding for the four masks the hardware understands,
 * or 0xff when the mask has no compact encoding.
 */
static uint8_t
tlb_stencil_setup_writemask(uint8_t mask)
{
        static const struct {
                uint8_t mask;
                uint8_t bits;
        } encodings[] = {
                { 0x01, 0 },
                { 0x03, 1 },
                { 0x0f, 2 },
                { 0xff, 3 },
        };

        for (size_t i = 0; i < sizeof(encodings) / sizeof(encodings[0]); i++) {
                if (encodings[i].mask == mask)
                        return encodings[i].bits;
        }

        return 0xff;
}

/* Packs one face's stencil state into the TLB_STENCIL_SETUP uniform word.
 * writemask_bits is the compact encoding from
 * tlb_stencil_setup_writemask(); 0xff means "no compact form" and the
 * writemask is then supplied through a separate uniform instead.
 */
static uint32_t
tlb_stencil_setup_bits(const struct pipe_stencil_state *state,
                       uint8_t writemask_bits)
{
        /* Gallium stencil op -> hardware op encoding. */
        static const uint8_t op_map[] = {
                [PIPE_STENCIL_OP_ZERO] = 0,
                [PIPE_STENCIL_OP_KEEP] = 1,
                [PIPE_STENCIL_OP_REPLACE] = 2,
                [PIPE_STENCIL_OP_INCR] = 3,
                [PIPE_STENCIL_OP_DECR] = 4,
                [PIPE_STENCIL_OP_INVERT] = 5,
                [PIPE_STENCIL_OP_INCR_WRAP] = 6,
                [PIPE_STENCIL_OP_DECR_WRAP] = 7,
        };

        /* The stencil reference value is patched in at uniform upload
         * time, so only the valuemask goes in the low byte here.
         */
        uint32_t bits = state->valuemask << 0;

        bits |= state->func << 16;
        bits |= op_map[state->fail_op] << 19;
        bits |= op_map[state->zpass_op] << 22;
        bits |= op_map[state->zfail_op] << 25;
        if (writemask_bits != 0xff)
                bits |= writemask_bits << 28;

        return bits;
}

/**
 * Rebuilds the cached depth/stencil/alpha state from the bound pipeline
 * and any dynamic stencil state, packing the hardware config bits and
 * TLB stencil uniforms.
 *
 * The zsa object is allocated lazily and reused for the lifetime of the
 * context; it is freed by vc4_context_free().
 */
static VkResult
vc4_create_depth_stencil_alpha_state(struct vc4_context *vc4, struct vc4_cmd_buffer *cmd_buffer)
{
   struct vc4_depth_stencil_alpha_state *zsa = vc4->zsa;

   if (!zsa) {
      zsa = vk_zalloc(&vc4->device->vk.alloc,
                      sizeof(struct vc4_depth_stencil_alpha_state), 8,
                      VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
      if (!zsa)
         return vk_error(NULL, VK_ERROR_OUT_OF_HOST_MEMORY);

      vc4->zsa = zsa;
   }

   /* The zsa object is cached and the code below only ORs bits in, so
    * reset the packed state first; otherwise bits from a previously bound
    * pipeline (e.g. Z_UPDATE) would leak into this one.
    */
   zsa->config_bits[0] = 0;
   zsa->config_bits[1] = 0;
   zsa->config_bits[2] = 0;
   zsa->stencil_uniforms[0] = 0;
   zsa->stencil_uniforms[1] = 0;
   zsa->stencil_uniforms[2] = 0;

   /* base setup: start from the pipeline's static state, then apply any
    * dynamic-state overrides below.
    */
   zsa->base = cmd_buffer->state.pipeline->ds_state;

   if (zsa->base.stencil[0].enabled) {
      if (cmd_buffer->state.dirty & VC4_CMD_DIRTY_STENCIL_COMPARE_MASK) {
         zsa->base.stencil[0].valuemask = cmd_buffer->state.dynamic.stencil_compare_mask.front;
         zsa->base.stencil[1].valuemask = cmd_buffer->state.dynamic.stencil_compare_mask.back;
      }

      if (cmd_buffer->state.dirty & VC4_CMD_DIRTY_STENCIL_WRITE_MASK) {
         zsa->base.stencil[0].writemask = cmd_buffer->state.dynamic.stencil_write_mask.front;
         zsa->base.stencil[1].writemask = cmd_buffer->state.dynamic.stencil_write_mask.back;
      }

      if (cmd_buffer->state.dirty & VC4_CMD_DIRTY_STENCIL_REFERENCE) {
         vc4->stencil_ref.ref_value[0] = cmd_buffer->state.dynamic.stencil_reference.front;
         vc4->stencil_ref.ref_value[1] = cmd_buffer->state.dynamic.stencil_reference.back;
         vc4->dirty |= VC4_DIRTY_STENCIL_REF;
      }
   }

   /* We always keep the early Z state correct, since a later state using
    * early Z may want it.
    */
   zsa->config_bits[2] |= VC4_CONFIG_BITS_EARLY_Z_UPDATE;

   if (zsa->base.depth.enabled) {
      if (zsa->base.depth.writemask) {
         zsa->config_bits[1] |= VC4_CONFIG_BITS_Z_UPDATE;
      }
      zsa->config_bits[1] |= (zsa->base.depth.func <<
                              VC4_CONFIG_BITS_DEPTH_FUNC_SHIFT);

      /* We only handle early Z in the < direction because otherwise
      * we'd have to runtime guess which direction to set in the
      * render config.
      */
      if ((zsa->base.depth.func == PIPE_FUNC_LESS ||
         zsa->base.depth.func == PIPE_FUNC_LEQUAL) &&
         (!zsa->base.stencil[0].enabled ||
         (zsa->base.stencil[0].zfail_op == PIPE_STENCIL_OP_KEEP &&
            (!zsa->base.stencil[1].enabled ||
            zsa->base.stencil[1].zfail_op == PIPE_STENCIL_OP_KEEP)))) {
            zsa->config_bits[2] |= VC4_CONFIG_BITS_EARLY_Z;
      }
   } else {
      /* Depth test disabled: the hardware still needs a func, so force
       * ALWAYS.
       */
      zsa->config_bits[1] |= (PIPE_FUNC_ALWAYS <<
                              VC4_CONFIG_BITS_DEPTH_FUNC_SHIFT);
   }

   if (zsa->base.stencil[0].enabled) {
      const struct pipe_stencil_state *front = &zsa->base.stencil[0];
      const struct pipe_stencil_state *back = &zsa->base.stencil[1];

      uint8_t front_writemask_bits =
            tlb_stencil_setup_writemask(front->writemask);
      /* If back-face stencil is disabled, the back face mirrors the
       * front state.
       */
      uint8_t back_writemask = front->writemask;
      uint8_t back_writemask_bits = front_writemask_bits;

      zsa->stencil_uniforms[0] =
            tlb_stencil_setup_bits(front, front_writemask_bits);
      if (back->enabled) {
            back_writemask = back->writemask;
            back_writemask_bits =
                     tlb_stencil_setup_writemask(back->writemask);

            /* Bits 31:30 tag which faces each uniform applies to:
             * 1 = front, 2 = back, 3 = both.
             */
            zsa->stencil_uniforms[0] |= (1 << 30);
            zsa->stencil_uniforms[1] =
                     tlb_stencil_setup_bits(back, back_writemask_bits);
            zsa->stencil_uniforms[1] |= (2 << 30);
      } else {
            zsa->stencil_uniforms[0] |= (3 << 30);
      }

      /* If either writemask has no compact encoding, upload both masks
       * through a separate uniform word.
       */
      if (front_writemask_bits == 0xff ||
         back_writemask_bits == 0xff) {
            zsa->stencil_uniforms[2] = (front->writemask |
                                       (back_writemask << 8));
      }
   }

   //TODO: alpha test is not translated yet.
   zsa->base.alpha.enabled = false;

   vc4->dirty |= VC4_DIRTY_ZSA;

   return VK_SUCCESS;
}

/**
 * Builds the gallium constant-buffer bindings from Vulkan state: cb[0]
 * always holds the push-constant block, and cb[1..] hold the UBOs listed
 * in the pipeline's ubo_map.
 */
static VkResult
vc4_set_constant_buffer(struct vc4_context *vc4, struct vc4_cmd_buffer *cmd_buffer)
{
   struct vc4_constbuf_stateobj *so = &vc4->constbuf;

   struct vc4_descriptor_map *descriptor_map = &cmd_buffer->state.pipeline->ubo_map;
   struct vc4_descriptor_state *set_state = &cmd_buffer->state.descriptor_state[VK_PIPELINE_BIND_POINT_GRAPHICS];

   //push constant
   so->cb[0].user_buffer = cmd_buffer->push_constants_data;
   so->cb[0].buffer_size = MAX_PUSH_CONSTANT_SIZE;
   so->cb[0].buffer_offset = 0;

   if (descriptor_map->num_desc == 0)
      return VK_SUCCESS;

   /* UBO */
   /* NOTE(review): i serves both as the cb[] slot (starting at 1, after
    * the push constants) and as the index into descriptor_map's arrays,
    * so map entries 1..num_desc are read and entry 0 is skipped.  Confirm
    * the map is really built with a reserved slot 0; otherwise this is
    * off by one.
    */
   for (int i = 1; (i < descriptor_map->num_desc + 1) && (i < PIPE_MAX_CONSTANT_BUFFERS); i++) {

      struct vc4_descriptor_set *set = set_state->descriptor_sets[descriptor_map->set[i]];
      const struct vc4_descriptor_set_binding_layout *binding_layout = &set->layout->binding[descriptor_map->binding[i]];

      struct vc4_descriptor *descriptor = set->descriptors;

      /* Walk to this binding's descriptor, then to the array element. */
      descriptor += binding_layout->descriptor_index;
      descriptor += descriptor_map->array_index[i];

      /* Lazily CPU-map the UBO's BO so it can be read at uniform-upload
       * time.  NOTE(review): vc4_bo_map's result isn't checked here.
       */
      if(!descriptor->buffer->bo->map)
         vc4_bo_map(cmd_buffer->device, descriptor->buffer->bo);

      so->cb[i].user_buffer = descriptor->buffer->bo->map;
      so->cb[i].buffer_offset = descriptor->buffer->bo_offset + descriptor->offset;
      so->cb[i].buffer_size = descriptor->buffer->size;

      assert(so->cb[i].user_buffer);
   }

   return VK_SUCCESS;
}

/* Maps a Vulkan sampler address mode onto the corresponding gallium
 * PIPE_TEX_WRAP_* value.
 */
static inline unsigned vk_conv_wrap_mode(enum VkSamplerAddressMode addr_mode)
{
   unsigned wrap = 0;

   switch (addr_mode) {
   case VK_SAMPLER_ADDRESS_MODE_REPEAT:
      wrap = PIPE_TEX_WRAP_REPEAT;
      break;
   case VK_SAMPLER_ADDRESS_MODE_MIRRORED_REPEAT:
      wrap = PIPE_TEX_WRAP_MIRROR_REPEAT;
      break;
   case VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE:
      wrap = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
      break;
   case VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER:
      wrap = PIPE_TEX_WRAP_CLAMP_TO_BORDER;
      break;
   case VK_SAMPLER_ADDRESS_MODE_MIRROR_CLAMP_TO_EDGE:
      wrap = PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE;
      break;
   default:
      assert(0);
      break;
   }

   return wrap;
}

/* Converts a gallium PIPE_TEX_WRAP_* mode into the hardware's 2-bit wrap
 * encoding.  PIPE_TEX_WRAP_CLAMP has no direct hardware equivalent: it is
 * approximated with clamp-to-edge when filtering is nearest and
 * clamp-to-border otherwise.
 */
static uint32_t translate_wrap(uint32_t p_wrap, bool using_nearest)
{
   switch (p_wrap) {
   case PIPE_TEX_WRAP_REPEAT:
      return 0;
   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
      return 1;
   case PIPE_TEX_WRAP_MIRROR_REPEAT:
      return 2;
   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
      return 3;
   case PIPE_TEX_WRAP_CLAMP:
      if (using_nearest)
         return 1;
      return 3;
   default:
      fprintf(stderr, "Unknown wrap mode %d\n", p_wrap);
      assert(!"not reached");
      return 0;
   }
}

/* Fills a vc4_sampler_state (gallium base sampler state plus the packed
 * texture P1 filter/wrap bits) from a Vulkan sampler's create info.
 */
static void fill_sampler(struct vc4_sampler_state *vc4_ss,
                         struct vc4_sampler *samp)
{
   /* Indexed by min_mip_filter * 2 + min_img_filter; the last two
    * entries are the no-mipmap cases.
    */
   static const uint8_t minfilter_map[6] = {
       VC4_TEX_P1_MINFILT_NEAR_MIP_NEAR,
       VC4_TEX_P1_MINFILT_LIN_MIP_NEAR,
       VC4_TEX_P1_MINFILT_NEAR_MIP_LIN,
       VC4_TEX_P1_MINFILT_LIN_MIP_LIN,
       VC4_TEX_P1_MINFILT_NEAREST,
       VC4_TEX_P1_MINFILT_LINEAR,
   };
   static const uint32_t magfilter_map[] = {
       [PIPE_TEX_FILTER_NEAREST] = VC4_TEX_P1_MAGFILT_NEAREST,
       [PIPE_TEX_FILTER_LINEAR] = VC4_TEX_P1_MAGFILT_LINEAR,
   };

   // init struct vc4_sampler_state: base
   struct pipe_sampler_state *ss = &vc4_ss->base;

   /* Translate the Vulkan create info into gallium sampler state
    * (pattern borrowed from val_execute.c).
    */
   ss->wrap_s = vk_conv_wrap_mode(samp->create_info.addressModeU);
   ss->wrap_t = vk_conv_wrap_mode(samp->create_info.addressModeV);
   ss->wrap_r = vk_conv_wrap_mode(samp->create_info.addressModeW);
   ss->min_img_filter = samp->create_info.minFilter == VK_FILTER_LINEAR ? PIPE_TEX_FILTER_LINEAR : PIPE_TEX_FILTER_NEAREST;
   ss->min_mip_filter = samp->create_info.mipmapMode == VK_SAMPLER_MIPMAP_MODE_LINEAR ? PIPE_TEX_MIPFILTER_LINEAR : PIPE_TEX_MIPFILTER_NEAREST;
   ss->mag_img_filter = samp->create_info.magFilter == VK_FILTER_LINEAR ? PIPE_TEX_FILTER_LINEAR : PIPE_TEX_FILTER_NEAREST;
   ss->min_lod = samp->create_info.minLod;
   ss->max_lod = samp->create_info.maxLod;
   ss->lod_bias = samp->create_info.mipLodBias;
   ss->max_anisotropy = samp->create_info.maxAnisotropy;
   ss->normalized_coords = !samp->create_info.unnormalizedCoordinates;
   ss->compare_mode = samp->create_info.compareEnable ? PIPE_TEX_COMPARE_R_TO_TEXTURE : PIPE_TEX_COMPARE_NONE;
   /* NOTE(review): relies on VkCompareOp values matching PIPE_FUNC_*
    * numerically (both are NEVER..ALWAYS = 0..7); an explicit conversion
    * would be more robust.
    */
   ss->compare_func = samp->create_info.compareOp;
   ss->seamless_cube_map = true;

   switch (samp->create_info.borderColor) {
   case VK_BORDER_COLOR_FLOAT_TRANSPARENT_BLACK:
   case VK_BORDER_COLOR_INT_TRANSPARENT_BLACK:
   default:
      memset(ss->border_color.f, 0, 4 * sizeof(float));
      break;
   case VK_BORDER_COLOR_FLOAT_OPAQUE_BLACK:
      ss->border_color.f[0] = ss->border_color.f[1] = ss->border_color.f[2] = 0.0f;
      ss->border_color.f[3] = 1.0f;
      break;
   case VK_BORDER_COLOR_INT_OPAQUE_BLACK:
      ss->border_color.i[0] = ss->border_color.i[1] = ss->border_color.i[2] = 0;
      ss->border_color.i[3] = 1;
      break;
   case VK_BORDER_COLOR_FLOAT_OPAQUE_WHITE:
      ss->border_color.f[0] = ss->border_color.f[1] = ss->border_color.f[2] = 1.0f;
      ss->border_color.f[3] = 1.0f;
      break;
   case VK_BORDER_COLOR_INT_OPAQUE_WHITE:
      ss->border_color.i[0] = ss->border_color.i[1] = ss->border_color.i[2] = 1;
      ss->border_color.i[3] = 1;
      break;
   }

   /* Fixed: img filters are PIPE_TEX_FILTER_* values; the original
    * compared against PIPE_TEX_MIPFILTER_NEAREST, a different enum that
    * only worked because both constants happen to be 0.  Used by
    * translate_wrap() to pick the PIPE_TEX_WRAP_CLAMP approximation.
    */
   bool either_nearest = (ss->mag_img_filter == PIPE_TEX_FILTER_NEAREST ||
                          ss->min_img_filter == PIPE_TEX_FILTER_NEAREST);

   vc4_ss->texture_p1 = (VC4_SET_FIELD(magfilter_map[ss->mag_img_filter],
                                       VC4_TEX_P1_MAGFILT) |
                         VC4_SET_FIELD(minfilter_map[ss->min_mip_filter * 2 +
                                                     ss->min_img_filter],
                                       VC4_TEX_P1_MINFILT) |
                         VC4_SET_FIELD(translate_wrap(ss->wrap_s, either_nearest),
                                       VC4_TEX_P1_WRAP_S) |
                         VC4_SET_FIELD(translate_wrap(ss->wrap_t, either_nearest),
                                       VC4_TEX_P1_WRAP_T));
}

/**
 * (Re)builds the fragment-stage sampler array from the pipeline's
 * sampler_map and the bound descriptor sets.  The array is cached on the
 * context and grown when the descriptor count increases; it is freed by
 * vc4_context_free().
 */
static VkResult
vc4_create_sampler_state(struct vc4_context *vc4, struct vc4_cmd_buffer *cmd_buffer)
{
   struct vc4_descriptor_map *sample_map = &cmd_buffer->state.pipeline->sampler_map;
   struct vc4_descriptor_state *set_state = &cmd_buffer->state.descriptor_state[VK_PIPELINE_BIND_POINT_GRAPHICS];

   struct vc4_sampler_state *ss = vc4->fragtex.samplers[0];

   /* The old array is too small: free it.  Clear the cached pointer
    * immediately so an allocation failure below can't leave
    * vc4->fragtex.samplers[0] dangling (vc4_context_free() would free it
    * a second time).
    */
   if (ss && vc4->fragtex.num_samplers < sample_map->num_desc) {
      vk_free(&cmd_buffer->device->vk.alloc, ss);
      ss = NULL;
      vc4->fragtex.samplers[0] = NULL;
   }

   vc4->fragtex.num_samplers = sample_map->num_desc;

   if (sample_map->num_desc == 0)
      return VK_SUCCESS;

   if (!ss) {
      ss = vk_zalloc(&cmd_buffer->device->vk.alloc,
                     sizeof(struct vc4_sampler_state) * sample_map->num_desc, 8,
                     VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
      if (!ss)
         return vk_error(NULL, VK_ERROR_OUT_OF_HOST_MEMORY);
   }

   for (int i = 0; i < sample_map->num_desc; i++) {
      struct vc4_descriptor_set *set = set_state->descriptor_sets[sample_map->set[i]];
      const struct vc4_descriptor_set_binding_layout *binding_layout = &set->layout->binding[sample_map->binding[i]];

      /* Walk to this binding's descriptor, then to the array element. */
      struct vc4_descriptor *descriptor = set->descriptors;

      descriptor += binding_layout->descriptor_index;
      descriptor += sample_map->array_index[i];

      struct vc4_sampler *sampler = descriptor->sampler;
      fill_sampler(&ss[i], sampler);

      vc4->fragtex.samplers[i] = &ss[i];
   }

   return VK_SUCCESS;
}

/**
 * (Re)builds the fragment-stage sampler-view array (gallium base view
 * state plus the packed texture P0/P1 words) from the pipeline's
 * texture_map and the bound descriptor sets.  The array is cached on the
 * context and grown when the descriptor count increases; it is freed by
 * vc4_context_free().
 */
static VkResult
vc4_create_sampler_view(struct vc4_context *vc4, struct vc4_cmd_buffer *cmd_buffer)
{
   struct vc4_descriptor_map *texture_map = &cmd_buffer->state.pipeline->texture_map;
   struct vc4_descriptor_state *set_state = &cmd_buffer->state.descriptor_state[VK_PIPELINE_BIND_POINT_GRAPHICS];

   struct vc4_sampler_view *so = vc4->fragtex.textures[0];

   /* The old array is too small: free it.  Clear the cached pointer
    * immediately so an allocation failure below can't leave
    * vc4->fragtex.textures[0] dangling (vc4_context_free() would free it
    * a second time).
    */
   if (so && vc4->fragtex.num_textures < texture_map->num_desc) {
      vk_free(&cmd_buffer->device->vk.alloc, so);
      so = NULL;
      vc4->fragtex.textures[0] = NULL;
   }

   vc4->fragtex.num_textures = texture_map->num_desc;

   if (texture_map->num_desc == 0)
      return VK_SUCCESS;

   if (!so) {
      so = vk_zalloc(&cmd_buffer->device->vk.alloc,
                     sizeof(struct vc4_sampler_view) * texture_map->num_desc, 8,
                     VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
      if (!so)
         return vk_error(NULL, VK_ERROR_OUT_OF_HOST_MEMORY);
   }

   for (int i = 0; i < texture_map->num_desc; i++) {

      struct vc4_descriptor_set *set = set_state->descriptor_sets[texture_map->set[i]];
      const struct vc4_descriptor_set_binding_layout *binding_layout = &set->layout->binding[texture_map->binding[i]];

      /* Walk to this binding's descriptor, then to the array element. */
      struct vc4_descriptor *descriptor = set->descriptors;

      descriptor += binding_layout->descriptor_index;
      descriptor += texture_map->array_index[i];

      struct vc4_image_view *iview = descriptor->image_view;
      const struct vc4_image *image = iview->image;

      //init struct pipe_sampler_view: base
      so[i].base.format = vk_format_to_pipe_format(iview->vk_format);
      so[i].base.target = vc4_vk_type_to_pipe_type(iview->type);

      so[i].base.swizzle_r = iview->swizzle[0];
      so[i].base.swizzle_g = iview->swizzle[1];
      so[i].base.swizzle_b = iview->swizzle[2];
      so[i].base.swizzle_a = iview->swizzle[3];

      so[i].base.u.tex.first_layer = iview->first_layer;
      so[i].base.u.tex.last_layer = iview->last_layer;

      so[i].base.u.tex.first_level = iview->base_level;
      so[i].base.u.tex.last_level = iview->max_level;

      so[i].base.texture = NULL;
      so[i].base.context = NULL;

      so[i].image = image;

      /* NOTE(review): force_first_level concerns mip levels, but this
       * tests first_layer; the gallium driver checks first_level here —
       * confirm which is intended.
       */
      if (so[i].base.u.tex.first_layer) {
         so[i].force_first_level = true;
      }

      /* P0: texture base address (4KB aligned), type low bits, miplevel
       * count and cube-map mode.
       */
      so[i].texture_p0 = (VC4_SET_FIELD((image->slices[0].offset + so[i].base.u.tex.first_layer * image->cube_map_stride) >> 12,
                                        VC4_TEX_P0_OFFSET) |
                          VC4_SET_FIELD(vc4_get_tex_format(image->vk_format) & 15,
                                        VC4_TEX_P0_TYPE) |
                          VC4_SET_FIELD(so[i].force_first_level ? so[i].base.u.tex.last_level : so[i].base.u.tex.last_level - so[i].base.u.tex.first_level,
                                        VC4_TEX_P0_MIPLVLS) |
                          VC4_SET_FIELD(so[i].base.target == PIPE_TEXTURE_CUBE, VC4_TEX_P0_CMMODE));

      /* P1: type high bit plus dimensions (11-bit fields; 2048 wraps
       * to 0 by design).
       */
      so[i].texture_p1 = (VC4_SET_FIELD(vc4_get_tex_format(image->vk_format) >> 4, VC4_TEX_P1_TYPE4) |
                          VC4_SET_FIELD(image->extent.height & 2047, VC4_TEX_P1_HEIGHT) |
                          VC4_SET_FIELD(image->extent.width & 2047, VC4_TEX_P1_WIDTH));

      if (vk_format_to_pipe_format(image->vk_format) == PIPE_FORMAT_ETC1_RGB8)
         so[i].texture_p1 |= VC4_TEX_P1_ETCFLIP_MASK;

      vc4->fragtex.textures[i] = &so[i];
   }

   return VK_SUCCESS;
}

static VkResult
vc4_raster_config(struct vc4_context *vc4, struct vc4_cmd_buffer *cmd_buffer)
{
   struct vc4_cmd_buffer_state *state = &cmd_buffer->state;

   struct V3D21_DEPTH_OFFSET depth_offset = {V3D21_DEPTH_OFFSET_header};
   struct V3D21_POINT_SIZE point_size = {V3D21_POINT_SIZE_header};
   struct V3D21_LINE_WIDTH line_width = {V3D21_LINE_WIDTH_header};

   struct vc4_rasterizer_state *so = vc4->rasterizer;

   if (!so) {
      //TODO: need free
      so = vk_zalloc(&cmd_buffer->device->vk.alloc,
                     sizeof(struct vc4_rasterizer_state), 8,
                     VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
      if (!so) {
         vk_free(&cmd_buffer->device->vk.alloc, vc4);
         return vk_error(NULL, VK_ERROR_OUT_OF_HOST_MEMORY);
      }

      vc4->rasterizer = so;
   }

   // I'm not sure why '!'
   if (!(state->pipeline->cullMode & VK_CULL_MODE_FRONT_BIT))
      so->config_bits[0] |= VC4_CONFIG_BITS_ENABLE_PRIM_FRONT;

   if (!(state->pipeline->cullMode & VK_CULL_MODE_BACK_BIT))
      so->config_bits[0] |= VC4_CONFIG_BITS_ENABLE_PRIM_BACK;

   /* Workaround: HW-2726 PTB does not handle zero-size points (BCM2835,
   * BCM21553).
   */
   //TODO:
   point_size.point_size = MAX2(0, .125f);

   line_width.line_width = state->dynamic.line_width;

   if (state->pipeline->frontFace == VK_FRONT_FACE_COUNTER_CLOCKWISE)
      so->config_bits[0] |= VC4_CONFIG_BITS_CW_PRIMITIVES;

   if (state->dynamic.depth_bias.depthBiasEnable) {
      so->config_bits[0] |= VC4_CONFIG_BITS_ENABLE_DEPTH_OFFSET;

      depth_offset.depth_offset_units =
          float_to_187_half(state->dynamic.depth_bias.constant_factor);
      depth_offset.depth_offset_factor =
          float_to_187_half(state->dynamic.depth_bias.slope_factor);
   }

   if (state->pipeline->rasterizationSamples == VK_SAMPLE_COUNT_4_BIT)
      so->config_bits[0] |= VC4_CONFIG_BITS_RASTERIZER_OVERSAMPLE_4X;

   V3D21_DEPTH_OFFSET_pack(NULL, so->packed.depth_offset, &depth_offset);
   V3D21_POINT_SIZE_pack(NULL, so->packed.point_size, &point_size);
   V3D21_LINE_WIDTH_pack(NULL, so->packed.line_width, &line_width);

   vc4->dirty |= VC4_DIRTY_FLAT_SHADE_FLAGS;
   vc4->dirty |= VC4_DIRTY_RASTERIZER;

   return VK_SUCCESS;
}

/* Tears down the per-command-buffer vc4_context: releases the lazily
 * allocated state objects, the program state, and the context itself.
 * Safe to call when no context was ever created.
 */
static
void vc4_context_free(struct vc4_cmd_buffer *cmd_buffer)
{
   struct vc4_context *ctx = cmd_buffer->state.vc4;

   if (ctx == NULL)
      return;

   if (ctx->rasterizer != NULL)
      vk_free(&cmd_buffer->device->vk.alloc, ctx->rasterizer);
   if (ctx->fragtex.textures[0] != NULL)
      vk_free(&cmd_buffer->device->vk.alloc, ctx->fragtex.textures[0]);
   if (ctx->fragtex.samplers[0] != NULL)
      vk_free(&cmd_buffer->device->vk.alloc, ctx->fragtex.samplers[0]);
   if (ctx->zsa != NULL)
      vk_free(&cmd_buffer->device->vk.alloc, ctx->zsa);

   vc4_program_fini(ctx);

   vk_free(&cmd_buffer->device->vk.alloc, ctx);
   cmd_buffer->state.vc4 = NULL;
}

/**
 * Translates the Vulkan state captured in cmd_buffer->state into the
 * gallium-style vc4_context consumed by the emit/compile code, honoring
 * the command buffer's dirty flags.  The context is allocated lazily on
 * the first draw and freed by vc4_context_free().
 */
static VkResult
vc4_vulkan_to_context(struct vc4_cmd_buffer *cmd_buffer, struct vc4_vk_job *job)
{
   struct vc4_cmd_buffer_state *state = &cmd_buffer->state;
   struct vc4_context *vc4 = state->vc4;

   if (!vc4) {
      vc4 = vk_zalloc(&cmd_buffer->device->vk.alloc,
                      sizeof(struct vc4_context), 8,
                      VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
      if (!vc4)
         return vk_error(NULL, VK_ERROR_OUT_OF_HOST_MEMORY);

      state->vc4 = vc4;
      vc4_program_init(vc4);

      /* Fresh context: force every piece of state to be retranslated. */
      cmd_buffer->state.dirty = ~0;
   }

   //------------------------------------------------------------------------
   vc4->device = cmd_buffer->device;
   vc4->job = job;

   //------------------------------------------------------------------------
   //vs,fs: hand the pipeline's NIR to the uncompiled-shader slots.
   if (cmd_buffer->state.dirty & VC4_CMD_DIRTY_PIPELINE) {
      vc4->prog.bind_fs.base.type = PIPE_SHADER_IR_NIR;
      vc4->prog.bind_fs.base.ir.nir = state->pipeline->fs->nir;
      vc4->dirty |= VC4_DIRTY_UNCOMPILED_FS;

      vc4->prog.bind_vs.base.type = PIPE_SHADER_IR_NIR;
      vc4->prog.bind_vs.base.ir.nir = state->pipeline->vs->nir;
      vc4->dirty |= VC4_DIRTY_UNCOMPILED_VS;
   }

   //------------------------------------------------------------------------
   //framebuffer: only the attachment formats are needed by the compiler.
   // vc4->framebuffer.nr_cbufs = state->pass->subpasses[0].color_count;
   if (job->color_write)
      vc4->framebuffer.cbufs[0].format = vk_format_to_pipe_format(job->color_write->vk_format);
   else
      vc4->framebuffer.cbufs[0].format = PIPE_FORMAT_NONE;

   if (job->zs_write)
      vc4->framebuffer.zsbuf.format = vk_format_to_pipe_format(job->zs_write->vk_format);
   else
      vc4->framebuffer.zsbuf.format = PIPE_FORMAT_NONE;

   vc4->dirty |= VC4_DIRTY_FRAMEBUFFER;

   //------------------------------------------------------------------------
   //scissor
   /* NOTE(review): maxx/maxy are assigned the extent (width/height), not
    * offset + extent — confirm the consumer expects a size rather than a
    * max coordinate here.
    */
   if (cmd_buffer->state.dirty & VC4_CMD_DIRTY_SCISSOR) {
      vc4->scissor.minx = state->dynamic.scissor.scissors[0].offset.x;
      vc4->scissor.miny = state->dynamic.scissor.scissors[0].offset.y;
      vc4->scissor.maxx = state->dynamic.scissor.scissors[0].extent.width;
      vc4->scissor.maxy = state->dynamic.scissor.scissors[0].extent.height;
      vc4->dirty |= VC4_DIRTY_SCISSOR;
   }

   //viewport: copy the precomputed scale/translate of viewport 0.
   if (cmd_buffer->state.dirty & VC4_CMD_DIRTY_VIEWPORT) {
      vc4->viewport.scale[0] = state->dynamic.viewport.scale[0][0];
      vc4->viewport.scale[1] = state->dynamic.viewport.scale[0][1];
      vc4->viewport.scale[2] = state->dynamic.viewport.scale[0][2];
      vc4->viewport.translate[0] = state->dynamic.viewport.translate[0][0];
      vc4->viewport.translate[1] = state->dynamic.viewport.translate[0][1];
      vc4->viewport.translate[2] = state->dynamic.viewport.translate[0][2];
      vc4->dirty |= VC4_DIRTY_VIEWPORT;
   }

   //------------------------------------------------------------------------
   //rasterizer (cached on the context; freed in vc4_context_free)
   if (cmd_buffer->state.dirty &
       (VC4_CMD_DIRTY_PIPELINE |
        VC4_CMD_DIRTY_DEPTH_BIAS |
        VC4_CMD_DIRTY_LINE_WIDTH)) {
      vc4_raster_config(vc4, cmd_buffer);
   }

   //------------------------------------------------------------------------
   //vtx: vertex-element layout lives in the pipeline, buffers in state.
   if (cmd_buffer->state.dirty & VC4_CMD_DIRTY_VERTEX_BUFFER) {
      vc4->vtx = &cmd_buffer->state.pipeline->vtx;
      vc4->vertexbuf = &cmd_buffer->state.vertexbuf;
      vc4->dirty |= VC4_DIRTY_VTXSTATE | VC4_DIRTY_VTXBUF;
   }
   //------------------------------------------------------------------------
   //UBO (also covers push constants in cb[0])
   if (cmd_buffer->state.dirty &
       (VC4_CMD_DIRTY_DESCRIPTOR_SETS |
        VC4_CMD_DIRTY_PUSH_CONSTANTS)) {
      vc4_set_constant_buffer(vc4, cmd_buffer);
   }

   //------------------------------------------------------------------------
   //texture / sampler
   /* NOTE(review): the VkResult of these helpers is discarded, so an OOM
    * in either goes unnoticed by the caller.
    */
   if (cmd_buffer->state.dirty & VC4_CMD_DIRTY_DESCRIPTOR_SETS) {
      vc4_create_sampler_view(vc4, cmd_buffer);
      vc4_create_sampler_state(vc4, cmd_buffer);
   }

   //----- not supported by now -------------------------------------------------------------------
   //color blend
   if (cmd_buffer->state.dirty & VC4_CMD_DIRTY_PIPELINE) {
      vc4_create_blend_state(vc4, cmd_buffer);
   }

   //depth/stencl test
   if (cmd_buffer->state.dirty & (VC4_CMD_DIRTY_PIPELINE |
                                  VC4_CMD_DIRTY_STENCIL_REFERENCE |
                                  VC4_CMD_DIRTY_STENCIL_WRITE_MASK|
                                  VC4_CMD_DIRTY_STENCIL_COMPARE_MASK)) {
      vc4_create_depth_stencil_alpha_state(vc4, cmd_buffer);
   }

   //vertex texture: vertex-stage sampling is not supported.
   vc4->verttex.num_samplers = 0;
   vc4->verttex.num_textures = 0;

   // vc4->fragtex.num_samplers = 0;
   // vc4->fragtex.num_textures = 0;
   //------------------------------------------------------------------------

   return VK_SUCCESS;
}

/**
 * Converts a 32-bit index buffer into a freshly allocated 16-bit shadow
 * buffer, since the VC4 hardware only supports 8/16-bit indices.  Aborts
 * on an index that doesn't fit in 16 bits (no 32-bit fallback exists).
 *
 * \param offset         byte offset of the first index in the source buffer
 * \param count          number of indices to convert
 * \param shadow_offset  returns the byte offset to use within the shadow
 *                       buffer (always 0: it holds only this draw's indices)
 */
static struct vc4_buffer *
vc4_get_shadow_index_buffer(struct vc4_context *vc4,
                            const struct vc4_draw_info *info,
                            uint32_t offset,
                            uint32_t count,
                            uint32_t *shadow_offset)
{
   //TODO: need free
   struct vc4_buffer *buffer = vk_zalloc(&vc4->device->vk.alloc, sizeof(*buffer), 8,
                                         VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (buffer == NULL) {
      vk_error(vc4->device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
      abort();
      // return NULL;
   }

   buffer->size = count * 2; /* two bytes per converted index */
   buffer->usage = VK_IMAGE_USAGE_TRANSFER_DST_BIT;
   buffer->flags = 0;
   buffer->bo_offset = 0;

   //TODO: need free
   buffer->bo = vc4_vk_bo_alloc(vc4->device, buffer->size);
   if (!buffer->bo) {
      /* Fixed: the vk_error() call was missing its semicolon. */
      vk_error(vc4->device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
      vk_free(&vc4->device->vk.alloc, buffer);
      abort();
      // return NULL;
   }

   /* The shadow buffer contains exactly this draw's indices, so the
    * caller's offset into it is always 0.  Fixed: the original assigned
    * the pointer itself (shadow_offset = 0), leaving the output
    * uninitialized.
    */
   *shadow_offset = 0;

   vc4_bo_map(vc4->device, buffer->bo);
   uint16_t *dst = buffer->bo->map;

   vc4_bo_map(vc4->device, info->index.resource->bo);
   const uint32_t *src = info->index.resource->bo->map + offset;

   for (uint32_t i = 0; i < count; i++) {
      uint32_t src_index = src[i];
      if (src_index > 0xffff)
         abort(); /* would need a real 32-bit index fallback */
      dst[i] = src_index;
   }

   vc4_bo_unmap(vc4->device, info->index.resource->bo);

   /* Remember the shadow so it can be released after the draw. */
   vc4->index_shadow = buffer;

   return buffer;
}

/*
 * Shared draw path for vkCmdDraw / vkCmdDrawIndexed: translates the
 * command-buffer state into the vc4_context, emits shader state if
 * dirty, then records the primitive packet(s) into the job's binner CL
 * and queues the job on the command buffer's render-job list.
 */
static void
cmd_buffer_draw(struct vc4_cmd_buffer *cmd_buffer,
                struct vc4_draw_info *info)
{
   struct vc4_vk_job *job;
   struct vc4_context *vc4;

   /* Get (or start) the hardware job for the current framebuffer. */
   job = vc4_get_job_for_fbo(cmd_buffer);

   /* HW-2116 workaround: handled before adding this draw's vertices. */
   vc4_hw_2116_workaround(cmd_buffer, job, info->vertex_count);

   /* Mirror Vulkan command-buffer state into the gallium-style
    * vc4_context that the shared emission helpers consume. */
   vc4_vulkan_to_context(cmd_buffer, job);
   vc4 = cmd_buffer->state.vc4;

   /* Reserve worst-case CL space so the cl_* emits below cannot run
    * out mid-packet. */
   vc4_get_draw_cl_space(job, info->vertex_count);

   vc4->dirty |= VC4_DIRTY_PRIM_MODE;

   vc4_start_draw(cmd_buffer, job);
   if (!vc4_update_compiled_shaders(vc4, info->mode)) {
      debug_warn_once("shader compile failed, skipping draw call.\n");
      return;
   }

   vc4_emit_state(vc4);

   bool needs_drawarrays_shader_state = false;

   /* Re-emit the GL shader state record when anything it captures
    * (vertex buffers/layout, prim mode, rasterizer, compiled programs,
    * their dirty uniforms) changed, or when the index bias differs
    * from the previous draw. */
   if ((vc4->dirty & (VC4_DIRTY_VTXBUF |
                     VC4_DIRTY_VTXSTATE |
                     VC4_DIRTY_PRIM_MODE |
                     VC4_DIRTY_RASTERIZER |
                     VC4_DIRTY_COMPILED_CS |
                     VC4_DIRTY_COMPILED_VS |
                     VC4_DIRTY_COMPILED_FS |
                     vc4->prog.cs->uniform_dirty_bits |
                     vc4->prog.vs->uniform_dirty_bits |
                     vc4->prog.fs->uniform_dirty_bits)) ||
      vc4->last_index_bias != info->index_bias) {
         if (info->vertex_count)
               vc4_emit_gl_shader_state(vc4, info, 0);
         else
               needs_drawarrays_shader_state = true;
   }

   vc4->dirty = 0;

   /* Note that the primitive type fields match with OpenGL/gallium
   * definitions, up to but not including QUADS.
   */
   if (info->index_size) {
            uint32_t index_size = info->index_size;
            uint32_t offset = info->start * index_size + info->index.idx_offset;
            struct vc4_buffer *prsc;

            /* The HW can't consume 32-bit indices; narrow them into a
             * 16-bit shadow buffer first.
             *
             * NOTE(review): &offset is passed as the out-parameter, but
             * the callee as written assigns its local pointer rather
             * than *shadow_offset — confirm the IB offset emitted below
             * is correct for the 32-bit path. */
            if (info->index_size == 4) {
               prsc = vc4_get_shadow_index_buffer(vc4, info,
                                                  offset,
                                                  info->vertex_count, &offset);
               index_size = 2;
            }
            else {
               prsc = info->index.resource;
            }

            struct vc4_buffer *rsc = prsc;

            struct vc4_cl_out *bcl = cl_start(&job->bcl);

            /* The original design for the VC4 kernel UABI had multiple
            * packets that used relocations in the BCL (some of which
            * needed two BOs), but later modifications eliminated all but
            * this one usage.  We have an arbitrary 32-bit offset value,
            * and need to also supply an arbitrary 32-bit index buffer
            * GEM handle, so we have this fake packet we emit in our BCL
            * to be validated, which the kernel uses at validation time
            * to perform the relocation in the IB packet (without
            * emitting to the actual HW).
            */
            uint32_t hindex = vc4_gem_hindex(job, rsc->bo);
            if (job->last_gem_handle_hindex != hindex) {
                  cl_u8(&bcl, VC4_PACKET_GEM_HANDLES);
                  cl_u32(&bcl, hindex);
                  cl_u32(&bcl, 0);
                  job->last_gem_handle_hindex = hindex;
            }

            /* Indexed primitive packet: mode + index width, count,
             * byte offset into the IB, and the max index for bounds
             * validation. */
            cl_u8(&bcl, VC4_PACKET_GL_INDEXED_PRIMITIVE);
            cl_u8(&bcl,
                  info->mode |
                  (index_size == 2 ?
                  VC4_INDEX_BUFFER_U16:
                  VC4_INDEX_BUFFER_U8));
            cl_u32(&bcl, info->vertex_count);
            cl_u32(&bcl, offset);
            cl_u32(&bcl, vc4->max_index);

            cl_end(&job->bcl, bcl);
            job->draw_calls_queued++;
   } else {
            uint32_t count = info->vertex_count;
            uint32_t start = info->start;
            uint32_t extra_index_bias = 0;
            static const uint32_t max_verts = 65535;

            /* GFXH-515 / SW-5891: The binner emits 16 bit indices for
            * drawarrays, which means that if start + count > 64k it
            * would truncate the top bits.  Work around this by emitting
            * a limited number of primitives at a time and reemitting the
            * shader state pointing farther down the vertex attribute
            * arrays.
            *
            * To do this properly for line loops or trifans, we'd need to
            * make a new VB containing the first vertex plus whatever
            * remainder.
            */
            if (start + count > max_verts) {
                  extra_index_bias = start;
                  start = 0;
                  needs_drawarrays_shader_state = true;
            }

            while (count) {
                  uint32_t this_count = count;
                  uint32_t step;

                  if (needs_drawarrays_shader_state) {
                           vc4_emit_gl_shader_state(vc4, info,
                                                   extra_index_bias);
                  }

                  /* NOTE(review): info->mode was already mapped through
                   * vk_to_pipe_prim_type by the vkCmdDraw* entry points;
                   * indexing the table a second time here looks like a
                   * double conversion — confirm the table is idempotent
                   * for the modes vc4 supports. */
                  u_split_draw(vk_to_pipe_prim_type[info->mode], max_verts, &this_count, &step);

                  cl_emit(&job->bcl, VERTEX_ARRAY_PRIMITIVES, array) {
                           array.primitive_mode = info->mode;
                           array.length = this_count;
                           array.index_of_first_vertex = start;
                  }
                  job->draw_calls_queued++;

                  count -= step;
                  extra_index_bias += start + step;
                  start = 0;
                  needs_drawarrays_shader_state = true;
            }
   }

   /* Mark depth/stencil as needing a store at the end of the job if
    * they are enabled and a zs attachment exists. */
   if (vc4->zsa && (vc4->framebuffer.zsbuf.format != PIPE_FORMAT_NONE)) {
         if (vc4->zsa->base.depth.enabled) {
               job->resolve |= PIPE_CLEAR_DEPTH;
         }

         if (vc4->zsa->base.stencil[0].enabled) {
               job->resolve |= PIPE_CLEAR_STENCIL;
         }
   }

   job->resolve |= PIPE_CLEAR_COLOR0;

   vc4_vk_build_submit_job(job);
   /* submit one job into render jobs list */
   vc4->job = NULL;
   list_addtail(&job->list, &cmd_buffer->state.render_jobs);
   /* clear cmd buffer dirty */
   cmd_buffer->state.dirty = 0;
}

/*
 * vkCmdDraw: record a non-indexed draw.  Packages the parameters into
 * a vc4_draw_info (index fields stay zero) and hands off to the shared
 * draw path.
 */
void
vc4_CmdDraw(VkCommandBuffer commandBuffer,
             uint32_t vertexCount,
             uint32_t instanceCount,
             uint32_t firstVertex,
             uint32_t firstInstance)
{
   VC4_FROM_HANDLE(vc4_cmd_buffer, cmd_buffer, commandBuffer);

   struct vc4_draw_info info = {
      .vertex_count = vertexCount,
      .instance_count = instanceCount,
      .first_instance = firstInstance,
      .start = firstVertex,
      .mode = vk_to_pipe_prim_type[cmd_buffer->state.pipeline->topology],
   };

   cmd_buffer_draw(cmd_buffer, &info);
}

/*
 * vkCmdDrawIndexed: record an indexed draw.
 *
 * The VC4 binner emits 16-bit indices, so more than 2^16 indices per
 * draw cannot be addressed; the count is clamped with a warning rather
 * than emitting a draw the hardware cannot execute.
 */
void
vc4_CmdDrawIndexed(VkCommandBuffer commandBuffer,
                    uint32_t indexCount,
                    uint32_t instanceCount,
                    uint32_t firstIndex,
                    int32_t vertexOffset,
                    uint32_t firstInstance)
{
   VC4_FROM_HANDLE(vc4_cmd_buffer, cmd_buffer, commandBuffer);

   if (indexCount > 0xffff) {
      /* fix: %u for uint32_t (was %d), and a sober message. */
      printf("vc4: max index count value supported is 2^16, "
             "but indexCount = %u; clamping to 0xffff\n", indexCount);
      indexCount = 0xffff;
   }

   assert(indexCount <= 0xffff);

   struct vc4_draw_info info = {};

   info.start = firstIndex;
   info.vertex_count = indexCount;

   info.instance_count = instanceCount;
   info.first_instance = firstInstance;

   /* vc4 only supports 8-bit or 16-bit index sizes; Vulkan provides
    * 16-bit or 32-bit (the 32-bit case is narrowed via a shadow buffer
    * in the draw path). */
   info.index_size = cmd_buffer->state.index_buffer.index_size;
   info.mode = vk_to_pipe_prim_type[cmd_buffer->state.pipeline->topology];

   info.index_bias = vertexOffset;
   info.min_index = firstIndex;
   info.max_index = firstIndex + indexCount;

   info.index.resource = vc4_buffer_from_handle(cmd_buffer->state.index_buffer.buffer);
   info.index.idx_offset = cmd_buffer->state.index_buffer.offset;

   cmd_buffer_draw(cmd_buffer, &info);
}

/*
 * Translate a Vulkan index type to its size in bytes.  Only 16- and
 * 32-bit indices are valid here; anything else is a caller bug.
 * (fix: removed unreachable `break` statements after `return`.)
 */
static uint8_t
get_index_size(VkIndexType index_type)
{
   switch (index_type) {
   case VK_INDEX_TYPE_UINT16:
      return 2;
   case VK_INDEX_TYPE_UINT32:
      return 4;
   default:
      unreachable("Unsupported index type");
   }
}

/*
 * vkCmdBindIndexBuffer: stash the index buffer handle, byte offset and
 * per-index size in the command-buffer state for the next indexed draw.
 */
void
vc4_CmdBindIndexBuffer(VkCommandBuffer commandBuffer,
                        VkBuffer buffer,
                        VkDeviceSize offset,
                        VkIndexType indexType)
{
   VC4_FROM_HANDLE(vc4_cmd_buffer, cmd_buffer, commandBuffer);

   const uint8_t index_size = get_index_size(indexType);

   cmd_buffer->state.index_buffer.buffer = buffer;
   cmd_buffer->state.index_buffer.offset = offset;
   cmd_buffer->state.index_buffer.index_size = index_size;

   /* fix: the original *cleared* this bit (&= ~) right after changing
    * the index-buffer state; every other state setter in this file sets
    * its dirty bit so the next draw picks the new state up. */
   cmd_buffer->state.dirty |= VC4_CMD_DIRTY_INDEX_BUFFER;
}

/*
 * vkCmdPushConstants: copy the new push-constant bytes into the
 * command buffer's staging area and mark them dirty, skipping the
 * update entirely when the incoming bytes are identical.
 */
void
vc4_CmdPushConstants(VkCommandBuffer commandBuffer,
                      VkPipelineLayout layout,
                      VkShaderStageFlags stageFlags,
                      uint32_t offset,
                      uint32_t size,
                      const void *pValues)
{
   VC4_FROM_HANDLE(vc4_cmd_buffer, cmd_buffer, commandBuffer);

   /* fix: the Vulkan spec requires offset + size <= maxPushConstantsSize,
    * so a range ending exactly at the limit is valid (the original '<'
    * rejected it). */
   assert(offset + size <= MAX_PUSH_CONSTANT_SIZE);

   /* Nothing changed — avoid the copy and the dirty bit. */
   if (!memcmp(cmd_buffer->push_constants_data + offset, pValues, size))
      return;

   memcpy(cmd_buffer->push_constants_data + offset, pValues, size);

   cmd_buffer->state.dirty |= VC4_CMD_DIRTY_PUSH_CONSTANTS;
}

/*
 * vkCmdSetViewport: store the dynamic viewports, recompute their
 * transform scale/translate vectors, and flag the viewport state dirty.
 * A redundant call (identical rects) leaves the dirty bit untouched.
 */
void
vc4_CmdSetViewport(VkCommandBuffer commandBuffer,
                    uint32_t firstViewport,
                    uint32_t viewportCount,
                    const VkViewport *pViewports)
{
   VC4_FROM_HANDLE(vc4_cmd_buffer, cmd_buffer, commandBuffer);
   struct vc4_cmd_buffer_state *state = &cmd_buffer->state;
   const uint32_t end = firstViewport + viewportCount;

   assert(firstViewport < MAX_VIEWPORTS);
   assert(end >= 1 && end <= MAX_VIEWPORTS);

   /* Track the highest viewport slot written so far. */
   if (state->dynamic.viewport.count < end)
      state->dynamic.viewport.count = end;

   VkViewport *dst = state->dynamic.viewport.viewports + firstViewport;
   const size_t bytes = viewportCount * sizeof(*pViewports);

   if (memcmp(dst, pViewports, bytes) == 0)
      return;

   memcpy(dst, pViewports, bytes);

   /* Refresh the derived scale/translate for every slot we touched. */
   for (uint32_t i = firstViewport; i < end; i++) {
      vc4_viewport_compute_xform(&state->dynamic.viewport.viewports[i],
                                  state->dynamic.viewport.scale[i],
                                  state->dynamic.viewport.translate[i]);
   }

   state->dirty |= VC4_CMD_DIRTY_VIEWPORT;
}

/*
 * vkCmdSetScissor: store the dynamic scissor rectangles and flag the
 * scissor state dirty; identical rects leave the dirty bit untouched.
 */
void
vc4_CmdSetScissor(VkCommandBuffer commandBuffer,
                   uint32_t firstScissor,
                   uint32_t scissorCount,
                   const VkRect2D *pScissors)
{
   VC4_FROM_HANDLE(vc4_cmd_buffer, cmd_buffer, commandBuffer);
   struct vc4_cmd_buffer_state *state = &cmd_buffer->state;
   const uint32_t end = firstScissor + scissorCount;

   assert(firstScissor < MAX_SCISSORS);
   assert(end >= 1 && end <= MAX_SCISSORS);

   /* Track the highest scissor slot written so far. */
   if (state->dynamic.scissor.count < end)
      state->dynamic.scissor.count = end;

   VkRect2D *dst = state->dynamic.scissor.scissors + firstScissor;
   const size_t bytes = scissorCount * sizeof(*pScissors);

   if (memcmp(dst, pScissors, bytes) == 0)
      return;

   memcpy(dst, pScissors, bytes);

   state->dirty |= VC4_CMD_DIRTY_SCISSOR;
}

/*
 * vkCmdSetStencilCompareMask: update the dynamic stencil compare mask
 * for the selected faces (VC4 stencil masks are 8 bits wide) and flag
 * the state dirty.
 */
void
vc4_CmdSetStencilCompareMask(VkCommandBuffer commandBuffer,
                             VkStencilFaceFlags faceMask,
                             uint32_t compareMask)
{
   VC4_FROM_HANDLE(vc4_cmd_buffer, cmd_buffer, commandBuffer);

   const uint32_t mask = compareMask & 0xff;

   if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
      cmd_buffer->state.dynamic.stencil_compare_mask.front = mask;
   if (faceMask & VK_STENCIL_FACE_BACK_BIT)
      cmd_buffer->state.dynamic.stencil_compare_mask.back = mask;

   cmd_buffer->state.dirty |= VC4_CMD_DIRTY_STENCIL_COMPARE_MASK;
}

/*
 * vkCmdSetStencilWriteMask: update the dynamic stencil write mask for
 * the selected faces (truncated to the hardware's 8 bits) and flag the
 * state dirty.
 */
void
vc4_CmdSetStencilWriteMask(VkCommandBuffer commandBuffer,
                           VkStencilFaceFlags faceMask,
                           uint32_t writeMask)
{
   VC4_FROM_HANDLE(vc4_cmd_buffer, cmd_buffer, commandBuffer);

   const uint32_t mask = writeMask & 0xff;

   if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
      cmd_buffer->state.dynamic.stencil_write_mask.front = mask;
   if (faceMask & VK_STENCIL_FACE_BACK_BIT)
      cmd_buffer->state.dynamic.stencil_write_mask.back = mask;

   cmd_buffer->state.dirty |= VC4_CMD_DIRTY_STENCIL_WRITE_MASK;
}

/*
 * vkCmdSetStencilReference: update the dynamic stencil reference value
 * for the selected faces (truncated to the hardware's 8 bits) and flag
 * the state dirty.
 */
void
vc4_CmdSetStencilReference(VkCommandBuffer commandBuffer,
                           VkStencilFaceFlags faceMask,
                           uint32_t reference)
{
   VC4_FROM_HANDLE(vc4_cmd_buffer, cmd_buffer, commandBuffer);

   const uint32_t ref = reference & 0xff;

   if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
      cmd_buffer->state.dynamic.stencil_reference.front = ref;
   if (faceMask & VK_STENCIL_FACE_BACK_BIT)
      cmd_buffer->state.dynamic.stencil_reference.back = ref;

   cmd_buffer->state.dirty |= VC4_CMD_DIRTY_STENCIL_REFERENCE;
}

/*
 * vkCmdPipelineBarrier: no-op on this driver.  Barriers that involve
 * the host need no GPU work; for GPU-to-GPU dependencies this path
 * currently emits nothing either (jobs are submitted whole per draw),
 * so the function only distinguishes the cases without acting on them.
 */
void
vc4_CmdPipelineBarrier(VkCommandBuffer commandBuffer,
                       VkPipelineStageFlags srcStageMask,
                       VkPipelineStageFlags dstStageMask,
                       VkDependencyFlags dependencyFlags,
                       uint32_t memoryBarrierCount,
                       const VkMemoryBarrier *pMemoryBarriers,
                       uint32_t bufferBarrierCount,
                       const VkBufferMemoryBarrier *pBufferBarriers,
                       uint32_t imageBarrierCount,
                       const VkImageMemoryBarrier *pImageBarriers)
{
   /* We only care about barriers between GPU jobs. */
   const bool host_side = srcStageMask == VK_PIPELINE_STAGE_HOST_BIT ||
                          dstStageMask == VK_PIPELINE_STAGE_HOST_BIT;
   if (host_side)
      return;
}
